[mlpack] 311/324: X tree commit
Barak A. Pearlmutter
barak+git at cs.nuim.ie
Sun Aug 17 08:22:21 UTC 2014
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch svn-trunk
in repository mlpack.
commit 1af30817cecc4b3d76b4acd9827df1409a902888
Author: andrewmw94 <andrewmw94 at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date: Thu Aug 14 14:42:43 2014 +0000
X tree commit
git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@17024 9d5b8971-822b-0410-80eb-d18c1038ef23
---
src/mlpack/core/tree/CMakeLists.txt | 4 +
src/mlpack/core/tree/rectangle_tree.hpp | 1 +
.../tree/rectangle_tree/r_star_tree_split_impl.hpp | 4 +-
.../core/tree/rectangle_tree/rectangle_tree.hpp | 25 +
.../tree/rectangle_tree/rectangle_tree_impl.hpp | 3 +
.../core/tree/rectangle_tree/x_tree_split.hpp | 79 ++-
.../core/tree/rectangle_tree/x_tree_split_impl.hpp | 689 +++++++++++++++++++++
src/mlpack/tests/rectangle_tree_test.cpp | 93 +++
8 files changed, 896 insertions(+), 2 deletions(-)
diff --git a/src/mlpack/core/tree/CMakeLists.txt b/src/mlpack/core/tree/CMakeLists.txt
index a4bd182..6d9fff8 100644
--- a/src/mlpack/core/tree/CMakeLists.txt
+++ b/src/mlpack/core/tree/CMakeLists.txt
@@ -42,6 +42,10 @@ set(SOURCES
rectangle_tree/r_tree_descent_heuristic_impl.hpp
rectangle_tree/r_star_tree_descent_heuristic.hpp
rectangle_tree/r_star_tree_descent_heuristic_impl.hpp
+ rectangle_tree/r_star_tree_split.hpp
+ rectangle_tree/r_star_tree_split_impl.hpp
+ rectangle_tree/x_tree_split.hpp
+ rectangle_tree/x_tree_split_impl.hpp
statistic.hpp
traversal_info.hpp
tree_traits.hpp
diff --git a/src/mlpack/core/tree/rectangle_tree.hpp b/src/mlpack/core/tree/rectangle_tree.hpp
index 2270a82..1760281 100644
--- a/src/mlpack/core/tree/rectangle_tree.hpp
+++ b/src/mlpack/core/tree/rectangle_tree.hpp
@@ -22,5 +22,6 @@
#include "rectangle_tree/r_tree_descent_heuristic.hpp"
#include "rectangle_tree/r_star_tree_descent_heuristic.hpp"
#include "rectangle_tree/traits.hpp"
+#include "rectangle_tree/x_tree_split.hpp"
#endif
diff --git a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
index 75d1e34..33d20b4 100644
--- a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
@@ -262,7 +262,7 @@ bool RStarTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(
return true;
}
-
+ /*
// If we haven't yet reinserted on this level, we try doing so now.
if(relevels[tree->TreeDepth()]) {
relevels[tree->TreeDepth()] = false;
@@ -321,6 +321,8 @@ bool RStarTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(
return false;
}
+
+*/
int bestOverlapIndexOnBestAxis = 0;
diff --git a/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp b/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp
index cc5d205..e9b144c 100644
--- a/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/rectangle_tree.hpp
@@ -39,6 +39,23 @@ template<typename SplitType,
typename MatType = arma::mat>
class RectangleTree
{
+ public:
+ /**
+ * The X tree requires that the tree records it's "split history". To make this easy, we
+ * use the following structure.
+ */
+ typedef struct SplitHistoryStruct {
+ int lastDimension;
+ std::vector<bool> history;
+ SplitHistoryStruct(int dim):
+ history(dim)
+ {
+ for(int i = 0; i < dim; i++)
+ history[i] = false;
+ }
+
+ } SplitHistoryStruct;
+
private:
//! The max number of child nodes a non-leaf node can have.
size_t maxNumChildren;
@@ -66,6 +83,8 @@ class RectangleTree
HRectBound<> bound;
//! Any extra data contained in the node.
StatisticType stat;
+ //! A struct to store the "split history" for X trees.
+ SplitHistoryStruct splitHistory;
//! The distance from the centroid of this node to the centroid of the parent.
double parentDistance;
//! The discance to the furthest descendant, cached to speed things up.
@@ -78,6 +97,7 @@ class RectangleTree
MatType* localDataset;
public:
+
//! So other classes can use TreeType::Mat.
typedef MatType Mat;
@@ -233,6 +253,11 @@ class RectangleTree
const StatisticType& Stat() const { return stat; }
//! Modify the statistic object for this node.
StatisticType& Stat() { return stat; }
+
+ //! Return the split history object of this node.
+ const SplitHistoryStruct& SplitHistory() const { return splitHistory; }
+ //! Modify the split history object of this node.
+ SplitHistoryStruct& SplitHistory() { return splitHistory; }
//! Return whether or not this node is a leaf (true if it has no children).
bool IsLeaf() const;
diff --git a/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp b/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp
index b344876..485237e 100644
--- a/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/rectangle_tree_impl.hpp
@@ -38,6 +38,7 @@ count(0),
maxLeafSize(maxLeafSize),
minLeafSize(minLeafSize),
bound(data.n_rows),
+splitHistory(bound.Dim()),
parentDistance(0),
dataset(data),
points(maxLeafSize + 1), // Add one to make splitting the node simpler.
@@ -69,6 +70,7 @@ count(0),
maxLeafSize(parentNode->MaxLeafSize()),
minLeafSize(parentNode->MinLeafSize()),
bound(parentNode->Bound().Dim()),
+splitHistory(bound.Dim()),
parentDistance(0),
dataset(parentNode->Dataset()),
points(maxLeafSize + 1), // Add one to make splitting the node simpler.
@@ -97,6 +99,7 @@ RectangleTree<SplitType, DescentType, StatisticType, MatType>::RectangleTree(
maxLeafSize(other.MaxLeafSize()),
minLeafSize(other.MinLeafSize()),
bound(other.bound),
+ splitHistory(other.SplitHistory()),
parentDistance(other.ParentDistance()),
dataset(other.dataset),
points(other.Points()),
diff --git a/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp b/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp
index 8d1c8b6..a6800ac 100644
--- a/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/x_tree_split.hpp
@@ -1 +1,78 @@
-
+/**
+ * @file x_tre_split.hpp
+ * @author Andrew Wells
+ *
+ * Defintion of the XTreeSplit class, a class that splits the nodes of an X tree, starting
+ * at a leaf node and moving upwards if necessary.
+ */
+#ifndef __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_HPP
+#define __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_HPP
+
+#include <mlpack/core.hpp>
+
+namespace mlpack {
+namespace tree /** Trees and tree-building procedures. */ {
+
+/**
+ * A Rectangle Tree has new points inserted at the bottom. When these
+ * nodes overflow, we split them, moving up the tree and splitting nodes
+ * as necessary.
+ */
+template<typename DescentType,
+ typename StatisticType,
+ typename MatType>
+class XTreeSplit
+{
+public:
+
+/**
+ * The X-tree paper says that a maximum allowable overlap of 20% works well.
+ */
+const static double MAX_OVERLAP = 0.2;
+
+/**
+ * Split a leaf node using the algorithm described in "The R*-tree: An Efficient and Robust Access method
+ * for Points and Rectangles." If necessary, this split will propagate
+ * upwards through the tree.
+ */
+static void SplitLeafNode(RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree, std::vector<bool>& relevels);
+
+/**
+ * Split a non-leaf node using the "default" algorithm. If this is a root node, the
+ * tree increases in depth.
+ */
+static bool SplitNonLeafNode(RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree, std::vector<bool>& relevels);
+
+private:
+/**
+ * Class to allow for faster sorting.
+ */
+class sortStruct {
+public:
+ double d;
+ int n;
+};
+
+/**
+ * Comparator for sorting with sortStruct.
+ */
+static bool structComp(const sortStruct& s1, const sortStruct& s2) {
+ return s1.d < s2.d;
+}
+
+/**
+ * Insert a node into another node.
+ */
+static void InsertNodeIntoTree(
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* destTree,
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* srcNode);
+
+};
+
+}; // namespace tree
+}; // namespace mlpack
+
+// Include implementation
+#include "x_tree_split_impl.hpp"
+
+#endif
\ No newline at end of file
diff --git a/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp b/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp
index 8d1c8b6..1ea4018 100644
--- a/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/x_tree_split_impl.hpp
@@ -1 +1,690 @@
+/**
+ * @file x_tree_split_impl.hpp
+ * @author Andrew Wells
+ *
+ * Implementation of class (XTreeSplit) to split a RectangleTree.
+ */
+#ifndef __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_IMPL_HPP
+#define __MLPACK_CORE_TREE_RECTANGLE_TREE_X_TREE_SPLIT_IMPL_HPP
+
+#include "x_tree_split.hpp"
+#include "rectangle_tree.hpp"
+#include <mlpack/core/math/range.hpp>
+
+namespace mlpack {
+namespace tree {
+
+/**
+ * We call GetPointSeeds to get the two points which will be the initial points in the new nodes
+ * We then call AssignPointDestNode to assign the remaining points to the two new nodes.
+ * Finally, we delete the old node and insert the new nodes into the tree, spliting the parent
+ * if necessary.
+ */
+template<typename DescentType,
+typename StatisticType,
+typename MatType>
+void XTreeSplit<DescentType, StatisticType, MatType>::SplitLeafNode(
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree,
+ std::vector<bool>& relevels)
+{
+ // If we are splitting the root node, we need will do things differently so that the constructor
+ // and other methods don't confuse the end user by giving an address of another node.
+ if (tree->Parent() == NULL) {
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* copy =
+ new RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(*tree, false); // We actually want to copy this way. Pointers and everything.
+
+ copy->Parent() = tree;
+ tree->Count() = 0;
+ tree->NullifyData();
+ tree->Children()[(tree->NumChildren())++] = copy; // Because this was a leaf node, numChildren must be 0.
+ assert(tree->NumChildren() == 1);
+ XTreeSplit<DescentType, StatisticType, MatType>::SplitLeafNode(copy, relevels);
+ return;
+ }
+
+ // If we haven't yet reinserted on this level, we try doing so now.
+ if(relevels[tree->TreeDepth()]) {
+ relevels[tree->TreeDepth()] = false;
+ // We sort the points by decreasing distance to the centroid of the bound.
+ // We then remove the first p entries and reinsert them at the root.
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* root = tree;
+ while(root->Parent() != NULL)
+ root = root->Parent();
+ size_t p = tree->MaxLeafSize() * 0.3; // The R*-tree paper says this works the best.
+ if(p == 0) {
+ SplitLeafNode(tree, relevels);
+ return;
+ }
+
+ std::vector<sortStruct> sorted(tree->Count());
+ arma::vec centroid;
+ tree->Bound().Centroid(centroid); // Modifies centroid.
+ for(size_t i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->Bound().Metric().Evaluate(centroid, tree->LocalDataset().col(i));
+ sorted[i].n = i;
+ }
+
+ std::sort(sorted.begin(), sorted.end(), structComp);
+ std::vector<int> pointIndices(p);
+ for(size_t i = 0; i < p; i++) {
+ pointIndices[i] = tree->Points()[sorted[sorted.size()-1-i].n]; // We start from the end of sorted.
+ root->DeletePoint(tree->Points()[sorted[sorted.size()-1-i].n], relevels);
+ }
+ for(size_t i = 0; i < p; i++) { // We reverse the order again to reinsert the closest points first.
+ root->InsertPoint(pointIndices[p-1-i], relevels);
+ }
+
+// // If we went below min fill, delete this node and reinsert all points.
+// if(tree->Count() < tree->MinLeafSize()) {
+// std::vector<int> pointIndices(tree->Count());
+// for(size_t i = 0; i < tree->Count(); i++) {
+// pointIndices[i] = tree->Points()[i];
+// }
+// root->RemoveNode(tree, relevels);
+// for(size_t i = 0; i < pointIndices.size(); i++) {
+// root->InsertPoint(pointIndices[i], relevels);
+// }
+// //tree->SoftDelete();
+// }
+ return;
+ }
+
+ int bestOverlapIndexOnBestAxis = 0;
+ int bestAreaIndexOnBestAxis = 0;
+ bool tiedOnOverlap = false;
+ int bestAxis = 0;
+ double bestAxisScore = DBL_MAX;
+ for (int j = 0; j < tree->Bound().Dim(); j++) {
+ double axisScore = 0.0;
+ // Since we only have points in the leaf nodes, we only need to sort once.
+ std::vector<sortStruct> sorted(tree->Count());
+ for (int i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->LocalDataset().col(i)[j];
+ sorted[i].n = i;
+ }
+
+ std::sort(sorted.begin(), sorted.end(), structComp);
+
+ // We'll store each of the three scores for each distribution.
+ std::vector<double> areas(tree->MaxLeafSize() - 2 * tree->MinLeafSize() + 2);
+ std::vector<double> margins(tree->MaxLeafSize() - 2 * tree->MinLeafSize() + 2);
+ std::vector<double> overlapedAreas(tree->MaxLeafSize() - 2 * tree->MinLeafSize() + 2);
+ for (int i = 0; i < areas.size(); i++) {
+ areas[i] = 0.0;
+ margins[i] = 0.0;
+ overlapedAreas[i] = 0.0;
+ }
+ for (int i = 0; i < areas.size(); i++) {
+ // The ith arrangement is obtained by placing the first tree->MinLeafSize() + i
+ // points in one rectangle and the rest in another. Then we calculate the three
+ // scores for that distribution.
+
+ int cutOff = tree->MinLeafSize() + i;
+ // We'll calculate the max and min in each dimension by hand to save time.
+ std::vector<double> maxG1(tree->Bound().Dim());
+ std::vector<double> minG1(maxG1.size());
+ std::vector<double> maxG2(maxG1.size());
+ std::vector<double> minG2(maxG1.size());
+ for (int k = 0; k < tree->Bound().Dim(); k++) {
+ minG1[k] = maxG1[k] = tree->LocalDataset().col(sorted[0].n)[k];
+ minG2[k] = maxG2[k] = tree->LocalDataset().col(sorted[sorted.size() - 1].n)[k];
+ for (int l = 1; l < tree->Count() - 1; l++) {
+ if (l < cutOff) {
+ if (tree->LocalDataset().col(sorted[l].n)[k] < minG1[k])
+ minG1[k] = tree->LocalDataset().col(sorted[l].n)[k];
+ else if (tree->LocalDataset().col(sorted[l].n)[k] > maxG1[k])
+ maxG1[k] = tree->LocalDataset().col(sorted[l].n)[k];
+ } else {
+ if (tree->LocalDataset().col(sorted[l].n)[k] < minG2[k])
+ minG2[k] = tree->LocalDataset().col(sorted[l].n)[k];
+ else if (tree->LocalDataset().col(sorted[l].n)[k] > maxG2[k])
+ maxG2[k] = tree->LocalDataset().col(sorted[l].n)[k];
+ }
+ }
+ }
+ double area1 = 1.0, area2 = 1.0;
+ double oArea = 1.0;
+ for (int k = 0; k < maxG1.size(); k++) {
+ margins[i] += maxG1[k] - minG1[k] + maxG2[k] - minG2[k];
+ area1 *= maxG1[k] - minG1[k];
+ area2 *= maxG2[k] - minG2[k];
+ oArea *= maxG1[k] < minG2[k] || maxG2[k] < minG1[k] ? 0.0 : std::min(maxG1[k], maxG2[k]) - std::max(minG1[k], minG2[k]);
+ }
+ areas[i] += area1 + area2;
+ overlapedAreas[i] += oArea;
+ axisScore += margins[i];
+ }
+
+ if (axisScore < bestAxisScore) {
+ bestAxisScore = axisScore;
+ bestAxis = j;
+ bestOverlapIndexOnBestAxis = 0;
+ bestAreaIndexOnBestAxis = 0;
+ for (int i = 1; i < areas.size(); i++) {
+ if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis]) {
+ tiedOnOverlap = false;
+ bestAreaIndexOnBestAxis = i;
+ bestOverlapIndexOnBestAxis = i;
+ } else if (overlapedAreas[i] == overlapedAreas[bestOverlapIndexOnBestAxis]) {
+ tiedOnOverlap = true;
+ if (areas[i] < areas[bestAreaIndexOnBestAxis])
+ bestAreaIndexOnBestAxis = i;
+ }
+ }
+ }
+ }
+
+ std::vector<sortStruct> sorted(tree->Count());
+ for (int i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->LocalDataset().col(i)[bestAxis];
+ sorted[i].n = i;
+ }
+
+ std::sort(sorted.begin(), sorted.end(), structComp);
+
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeOne = new
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeTwo = new
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+
+ // The leaf nodes should never have any overlap introduced by the above method
+ // since a split axis is chosen and then points are assigned based on their value
+ // along that axis.
+ if (tiedOnOverlap) {
+ for (int i = 0; i < tree->Count(); i++) {
+ if (i < bestAreaIndexOnBestAxis + tree->MinLeafSize())
+ treeOne->InsertPoint(tree->Points()[sorted[i].n]);
+ else
+ treeTwo->InsertPoint(tree->Points()[sorted[i].n]);
+ }
+ } else {
+ for (int i = 0; i < tree->Count(); i++) {
+ if (i < bestOverlapIndexOnBestAxis + tree->MinLeafSize())
+ treeOne->InsertPoint(tree->Points()[sorted[i].n]);
+ else
+ treeTwo->InsertPoint(tree->Points()[sorted[i].n]);
+ }
+ }
+
+ //Remove this node and insert treeOne and treeTwo
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* par = tree->Parent();
+ int index = -1;
+ for (int i = 0; i < par->NumChildren(); i++) {
+ if (par->Children()[i] == tree) {
+ index = i;
+ break;
+ }
+ }
+ assert(index != -1);
+ par->Children()[index] = treeOne;
+ par->Children()[par->NumChildren()++] = treeTwo;
+
+ //We now update the split history of each new node.
+ treeOne->SplitHistory().history[bestAxis] = true;
+ treeOne->SplitHistory().lastDimension = bestAxis;
+ treeTwo->SplitHistory().history[bestAxis] = true;
+ treeTwo->SplitHistory().lastDimension = bestAxis;
+
+ // we only add one at a time, so we should only need to test for equality
+ // just in case, we use an assert.
+ assert(par->NumChildren() <= par->MaxNumChildren()+1);
+ if (par->NumChildren() == par->MaxNumChildren()+1) {
+ SplitNonLeafNode(par, relevels);
+ }
+
+ assert(treeOne->Parent()->NumChildren() <= treeOne->MaxNumChildren());
+ assert(treeOne->Parent()->NumChildren() >= treeOne->MinNumChildren());
+ assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
+ assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
+
+ tree->SoftDelete();
+
+ return;
+}
+
+/**
+ * We call GetBoundSeeds to get the two new nodes that this one will be broken
+ * into. Then we call AssignNodeDestNode to move the children of this node
+ * into either of those two nodes. Finally, we delete the now unused information
+ * and recurse up the tree if necessary. We don't need to worry about the bounds
+ * higher up the tree because they were already updated if necessary.
+ */
+template<typename DescentType,
+typename StatisticType,
+typename MatType>
+bool XTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* tree,
+ std::vector<bool>& relevels)
+{
+ // If we are splitting the root node, we need will do things differently so that the constructor
+ // and other methods don't confuse the end user by giving an address of another node.
+ if (tree->Parent() == NULL) {
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* copy =
+ new RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(*tree, false); // We actually want to copy this way. Pointers and everything.
+
+ copy->Parent() = tree;
+ tree->NumChildren() = 0;
+ tree->NullifyData();
+ tree->Children()[(tree->NumChildren())++] = copy;
+ XTreeSplit<DescentType, StatisticType, MatType>::SplitNonLeafNode(copy, relevels);
+ return true;
+ }
+
+ // The X tree paper doesn't explain how to handle the split history when reinserting nodes
+ // and reinserting nodes seems to hurt the performance, so we don't do it.
+
+
+ // We find the split axis that will be used if the topological split fails now
+ // to save CPU time.
+
+ // Find the next split axis.
+ std::vector<bool> axes(tree->Bound().Dim());
+ std::vector<int> dimensionsLastUsed(tree->NumChildren());
+ for(size_t i = 0; i < tree->NumChildren(); i++)
+ dimensionsLastUsed[i] = tree->Child(i).SplitHistory().lastDimension;
+ std::sort(dimensionsLastUsed.begin(), dimensionsLastUsed.end());
+
+ int lastDim = dimensionsLastUsed[dimensionsLastUsed.size()/2];
+ int minOverlapSplitDimension = -1;
+
+ // See if we can use a new dimension.
+ for(size_t i = lastDim + 1; i < axes.size(); i++) {
+ axes[i] = true;
+ for(size_t j = 0; j < tree->NumChildren(); j++)
+ axes[i] = axes[i] & tree->Child(j).SplitHistory().history[i];
+ if(axes[i] == true) {
+ minOverlapSplitDimension = i;
+ break;
+ }
+ }
+ if(minOverlapSplitDimension == -1) {
+ for(size_t i = 0; i < lastDim + 1; i++) {
+ axes[i] = true;
+ for(size_t j = 0; j < tree->NumChildren(); j++)
+ axes[i] = axes[i] & tree->Child(j).SplitHistory().history[i];
+ if(axes[i] == true) {
+ minOverlapSplitDimension = i;
+ break;
+ }
+ }
+ }
+
+ bool minOverlapSplitUsesHi = false;
+ double bestScoreMinOverlapSplit = DBL_MAX;
+ double areaOfBestMinOverlapSplit = 0;
+ int bestIndexMinOverlapSplit = 0;
+
+
+ int bestOverlapIndexOnBestAxis = 0;
+ int bestAreaIndexOnBestAxis = 0;
+ bool tiedOnOverlap = false;
+ bool lowIsBest = true;
+ int bestAxis = 0;
+ double bestAxisScore = DBL_MAX;
+ double overlapBestOverlapAxis = 0;
+ double areaBestOverlapAxis = 0;
+ double overlapBestAreaAxis = 0;
+ double areaBestAreaAxis = 0;
+
+ for (int j = 0; j < tree->Bound().Dim(); j++) {
+ double axisScore = 0.0;
+
+ // We'll do Bound().Lo() now and use Bound().Hi() later.
+ std::vector<sortStruct> sorted(tree->NumChildren());
+ for (int i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->Children()[i]->Bound()[j].Lo();
+ sorted[i].n = i;
+ }
+
+ std::sort(sorted.begin(), sorted.end(), structComp);
+
+ // We'll store each of the three scores for each distribution.
+ std::vector<double> areas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+ std::vector<double> margins(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+ std::vector<double> overlapedAreas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+ for (int i = 0; i < areas.size(); i++) {
+ areas[i] = 0.0;
+ margins[i] = 0.0;
+ overlapedAreas[i] = 0.0;
+ }
+
+ for (int i = 0; i < areas.size(); i++) {
+ // The ith arrangement is obtained by placing the first tree->MinNumChildren() + i
+ // points in one rectangle and the rest in another. Then we calculate the three
+ // scores for that distribution.
+
+ int cutOff = tree->MinNumChildren() + i;
+ // We'll calculate the max and min in each dimension by hand to save time.
+ std::vector<double> maxG1(tree->Bound().Dim());
+ std::vector<double> minG1(maxG1.size());
+ std::vector<double> maxG2(maxG1.size());
+ std::vector<double> minG2(maxG1.size());
+ for (int k = 0; k < tree->Bound().Dim(); k++) {
+ minG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Lo();
+ maxG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Hi();
+ minG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Lo();
+ maxG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Hi();
+ for (int l = 1; l < tree->NumChildren() - 1; l++) {
+ if (l < cutOff) {
+ if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG1[k])
+ minG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+ else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG1[k])
+ maxG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+ } else {
+ if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG2[k])
+ minG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+ else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG2[k])
+ maxG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+ }
+ }
+ }
+ double area1 = 1.0, area2 = 1.0;
+ double oArea = 1.0;
+ for (int k = 0; k < maxG1.size(); k++) {
+ margins[i] += maxG1[k] - minG1[k] + maxG2[k] - minG2[k];
+ area1 *= maxG1[k] - minG1[k];
+ area2 *= maxG2[k] - minG2[k];
+ oArea *= maxG1[k] < minG2[k] || maxG2[k] < minG1[k] ? 0.0 : std::min(maxG1[k], maxG2[k]) - std::max(minG1[k], minG2[k]);
+ }
+ areas[i] += area1 + area2;
+ overlapedAreas[i] += oArea;
+ axisScore += margins[i];
+ }
+ if (axisScore < bestAxisScore) {
+ bestAxisScore = axisScore;
+ bestAxis = j;
+ double bestOverlapIndexOnBestAxis = 0;
+ double bestAreaIndexOnBestAxis = 0;
+ for (int i = 1; i < areas.size(); i++) {
+ if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis]) {
+ tiedOnOverlap = false;
+ bestAreaIndexOnBestAxis = i;
+ bestOverlapIndexOnBestAxis = i;
+ overlapBestOverlapAxis = overlapedAreas[i];
+ areaBestOverlapAxis = areas[i];
+ } else if (overlapedAreas[i] == overlapedAreas[bestOverlapIndexOnBestAxis]) {
+ tiedOnOverlap = true;
+ if (areas[i] < areas[bestAreaIndexOnBestAxis]) {
+ bestAreaIndexOnBestAxis = i;
+ overlapBestAreaAxis = overlapedAreas[i];
+ areaBestAreaAxis = areas[i];
+ }
+ }
+ }
+ }
+
+ // Track the minOverlapSplit data
+ if(minOverlapSplitDimension != -1 && j == minOverlapSplitDimension) {
+ for(int i = 0; i < overlapedAreas.size(); i++) {
+ if(overlapedAreas[i] < bestScoreMinOverlapSplit) {
+ bestScoreMinOverlapSplit = overlapedAreas[i];
+ bestIndexMinOverlapSplit = i;
+ areaOfBestMinOverlapSplit = areas[i];
+ }
+ }
+ }
+ }
+
+ //Now we do the same thing using Bound().Hi() and choose the best of the two.
+ for (int j = 0; j < tree->Bound().Dim(); j++) {
+ double axisScore = 0.0;
+
+ // We'll do Bound().Lo() now and use Bound().Hi() later.
+ std::vector<sortStruct> sorted(tree->NumChildren());
+ for (int i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->Children()[i]->Bound()[j].Hi();
+ sorted[i].n = i;
+ }
+
+ std::sort(sorted.begin(), sorted.end(), structComp);
+
+ // We'll store each of the three scores for each distribution.
+ std::vector<double> areas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+ std::vector<double> margins(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+ std::vector<double> overlapedAreas(tree->MaxNumChildren() - 2 * tree->MinNumChildren() + 2);
+ for (int i = 0; i < areas.size(); i++) {
+ areas[i] = 0.0;
+ margins[i] = 0.0;
+ overlapedAreas[i] = 0.0;
+ }
+
+ for (int i = 0; i < areas.size(); i++) {
+ // The ith arrangement is obtained by placing the first tree->MinNumChildren() + i
+ // points in one rectangle and the rest in another. Then we calculate the three
+ // scores for that distribution.
+
+ int cutOff = tree->MinNumChildren() + i;
+ // We'll calculate the max and min in each dimension by hand to save time.
+ std::vector<double> maxG1(tree->Bound().Dim());
+ std::vector<double> minG1(maxG1.size());
+ std::vector<double> maxG2(maxG1.size());
+ std::vector<double> minG2(maxG1.size());
+ for (int k = 0; k < tree->Bound().Dim(); k++) {
+ minG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Lo();
+ maxG1[k] = tree->Children()[sorted[0].n]->Bound()[k].Hi();
+ minG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Lo();
+ maxG2[k] = tree->Children()[sorted[sorted.size() - 1].n]->Bound()[k].Hi();
+ for (int l = 1; l < tree->NumChildren() - 1; l++) {
+ if (l < cutOff) {
+ if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG1[k])
+ minG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+ else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG1[k])
+ maxG1[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+ } else {
+ if (tree->Children()[sorted[l].n]->Bound()[k].Lo() < minG2[k])
+ minG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Lo();
+ else if (tree->Children()[sorted[l].n]->Bound()[k].Hi() > maxG2[k])
+ maxG2[k] = tree->Children()[sorted[l].n]->Bound()[k].Hi();
+ }
+ }
+ }
+ double area1 = 1.0, area2 = 1.0;
+ double oArea = 1.0;
+ for (int k = 0; k < maxG1.size(); k++) {
+ margins[i] += maxG1[k] - minG1[k] + maxG2[k] - minG2[k];
+ area1 *= maxG1[k] - minG1[k];
+ area2 *= maxG2[k] - minG2[k];
+ oArea *= maxG1[k] < minG2[k] || maxG2[k] < minG1[k] ? 0.0 : std::min(maxG1[k], maxG2[k]) - std::max(minG1[k], minG2[k]);
+ }
+ areas[i] += area1 + area2;
+ overlapedAreas[i] += oArea;
+ axisScore += margins[i];
+ }
+ if (axisScore < bestAxisScore) {
+ bestAxisScore = axisScore;
+ bestAxis = j;
+ lowIsBest = false;
+ double bestOverlapIndexOnBestAxis = 0;
+ double bestAreaIndexOnBestAxis = 0;
+ for (int i = 1; i < areas.size(); i++) {
+ if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis]) {
+ tiedOnOverlap = false;
+ bestAreaIndexOnBestAxis = i;
+ bestOverlapIndexOnBestAxis = i;
+ overlapBestOverlapAxis = overlapedAreas[i];
+ areaBestOverlapAxis = areas[i];
+ } else if (overlapedAreas[i] == overlapedAreas[bestOverlapIndexOnBestAxis]) {
+ tiedOnOverlap = true;
+ if (areas[i] < areas[bestAreaIndexOnBestAxis]) {
+ bestAreaIndexOnBestAxis = i;
+ overlapBestAreaAxis = overlapedAreas[i];
+ areaBestAreaAxis = areas[i];
+ }
+ }
+ }
+ }
+
+ // Track the minOverlapSplit data
+ if(minOverlapSplitDimension != -1 && j == minOverlapSplitDimension) {
+ for(int i = 0; i < overlapedAreas.size(); i++) {
+ if(overlapedAreas[i] < bestScoreMinOverlapSplit) {
+ minOverlapSplitUsesHi = true;
+ bestScoreMinOverlapSplit = overlapedAreas[i];
+ bestIndexMinOverlapSplit = i;
+ areaOfBestMinOverlapSplit = areas[i];
+ }
+ }
+ }
+ }
+
+ std::vector<sortStruct> sorted(tree->NumChildren());
+ if (lowIsBest) {
+ for (int i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->Children()[i]->Bound()[bestAxis].Lo();
+ sorted[i].n = i;
+ }
+ } else {
+ for (int i = 0; i < sorted.size(); i++) {
+ sorted[i].d = tree->Children()[i]->Bound()[bestAxis].Hi();
+ sorted[i].n = i;
+ }
+ }
+
+ std::sort(sorted.begin(), sorted.end(), structComp);
+
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeOne = new
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType> *treeTwo = new
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>(tree->Parent());
+
+ // Now as per the X-tree paper, we ensure that this split was good enough.
+ bool useMinOverlapSplit = false;
+ if (tiedOnOverlap) {
+ if(overlapBestAreaAxis/areaBestAreaAxis < MAX_OVERLAP) {
+ for (int i = 0; i < tree->NumChildren(); i++) {
+ if (i < bestAreaIndexOnBestAxis + tree->MinNumChildren())
+ InsertNodeIntoTree(treeOne, tree->Children()[sorted[i].n]);
+ else
+ InsertNodeIntoTree(treeTwo, tree->Children()[sorted[i].n]);
+ }
+ } else
+ useMinOverlapSplit = true;
+ } else {
+ if(overlapBestOverlapAxis/areaBestOverlapAxis < MAX_OVERLAP) {
+ for (int i = 0; i < tree->NumChildren(); i++) {
+ if (i < bestOverlapIndexOnBestAxis + tree->MinNumChildren())
+ InsertNodeIntoTree(treeOne, tree->Children()[sorted[i].n]);
+ else
+ InsertNodeIntoTree(treeTwo, tree->Children()[sorted[i].n]);
+ }
+ } else
+ useMinOverlapSplit = true;
+ }
+
+ // If the split was not good enough, then we try the minimal overlap split. If that fails, we create
+ // a "super node" (more accurately we resize this one to make it a super node).
+ if(useMinOverlapSplit) {
+ // If there is a dimension that might work, try that.
+ if(minOverlapSplitDimension != -1 && bestScoreMinOverlapSplit / areaOfBestMinOverlapSplit < MAX_OVERLAP) {
+ std::vector<sortStruct> sorted2(tree->NumChildren());
+ if (minOverlapSplitUsesHi) {
+ for (int i = 0; i < sorted2.size(); i++) {
+ sorted2[i].d = tree->Children()[i]->Bound()[bestAxis].Hi();
+ sorted2[i].n = i;
+ }
+ } else {
+ for (int i = 0; i < sorted2.size(); i++) {
+ sorted2[i].d = tree->Children()[i]->Bound()[bestAxis].Lo();
+ sorted2[i].n = i;
+ }
+ }
+ std::sort(sorted2.begin(), sorted2.end(), structComp);
+ for (int i = 0; i < tree->NumChildren(); i++) {
+ if (i < bestIndexMinOverlapSplit + tree->MinNumChildren())
+ InsertNodeIntoTree(treeOne, tree->Children()[sorted[i].n]);
+ else
+ InsertNodeIntoTree(treeTwo, tree->Children()[sorted[i].n]);
+ }
+ } else {
+
+
+
+
+
+ // The min overlap split failed so we create a supernode instead.
+ tree->MaxNumChildren() *= 2;
+ tree->MaxLeafSize() *= 2;
+ tree->LocalDataset().resize(tree->LocalDataset().n_rows, 2*tree->LocalDataset().n_cols);
+ tree->Children().resize(tree->MaxNumChildren()+1);
+ tree->Points().resize(tree->MaxLeafSize()+1);
+
+ return false;
+
+ }
+ }
+
+ // Update the split history of each child.
+
+ treeOne->SplitHistory().history[bestAxis] = true;
+ treeOne->SplitHistory().lastDimension = bestAxis;
+ treeTwo->SplitHistory().history[bestAxis] = true;
+ treeTwo->SplitHistory().lastDimension = bestAxis;
+
+
+
+
+
+
+
+
+ //Remove this node and insert treeOne and treeTwo
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* par = tree->Parent();
+ int index = 0;
+ for (int i = 0; i < par->NumChildren(); i++) {
+ if (par->Children()[i] == tree) {
+ index = i;
+ break;
+ }
+ }
+ par->Children()[index] = treeOne;
+ par->Children()[par->NumChildren()++] = treeTwo;
+
+ // we only add one at a time, so we should only need to test for equality
+ // just in case, we use an assert.
+ assert(par->NumChildren() <= par->MaxNumChildren()+1);
+ if (par->NumChildren() == par->MaxNumChildren()+1) {
+ SplitNonLeafNode(par, relevels);
+ }
+
+ // We have to update the children of each of these new nodes so that they record the
+ // correct parent.
+ for (int i = 0; i < treeOne->NumChildren(); i++) {
+ treeOne->Children()[i]->Parent() = treeOne;
+ }
+ for (int i = 0; i < treeTwo->NumChildren(); i++) {
+ treeTwo->Children()[i]->Parent() = treeTwo;
+ }
+
+
+ assert(treeOne->Parent()->NumChildren() <= treeOne->MaxNumChildren());
+ assert(treeOne->Parent()->NumChildren() >= treeOne->MinNumChildren());
+ assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
+ assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
+
+ assert(treeOne->MaxNumChildren() < 7);
+ assert(treeTwo->MaxNumChildren() < 7);
+
+ tree->SoftDelete();
+
+ return false;
+}
+
+/**
+ * Insert a node into another node. Expanding the bounds and updating the numberOfChildren.
+ */
+template<typename DescentType,
+typename StatisticType,
+typename MatType>
+void XTreeSplit<DescentType, StatisticType, MatType>::InsertNodeIntoTree(
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* destTree,
+ RectangleTree<XTreeSplit<DescentType, StatisticType, MatType>, DescentType, StatisticType, MatType>* srcNode)
+{
+ destTree->Bound() |= srcNode->Bound();
+ destTree->Children()[destTree->NumChildren()++] = srcNode;
+}
+
+}; // namespace tree
+}; // namespace mlpack
+
+#endif
diff --git a/src/mlpack/tests/rectangle_tree_test.cpp b/src/mlpack/tests/rectangle_tree_test.cpp
index e8a3223..d1210db 100644
--- a/src/mlpack/tests/rectangle_tree_test.cpp
+++ b/src/mlpack/tests/rectangle_tree_test.cpp
@@ -610,6 +610,99 @@ BOOST_AUTO_TEST_CASE(SingleTreeTraverserTest) {
}
}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+
+
+// A test to ensure that the SingleTreeTraverser is working correctly by comparing
+// its results to the results of a naive search.
+BOOST_AUTO_TEST_CASE(XTreeTraverserTest) {
+ arma::mat dataset;
+ dataset.randu(8, 1000); // 1000 points in 8 dimensions.
+ arma::Mat<size_t> neighbors1;
+ arma::mat distances1;
+ arma::Mat<size_t> neighbors2;
+ arma::mat distances2;
+
+ RectangleTree<tree::XTreeSplit<tree::RStarTreeDescentHeuristic, NeighborSearchStat<NearestNeighborSort>, arma::mat>,
+ tree::RStarTreeDescentHeuristic,
+ NeighborSearchStat<NearestNeighborSort>,
+ arma::mat> RTree(dataset, 20, 6, 5, 2, 0);
+
+ // nearest neighbor search with the R tree.
+ mlpack::neighbor::NeighborSearch<NearestNeighborSort, metric::LMetric<2, true>,
+ RectangleTree<tree::XTreeSplit<tree::RStarTreeDescentHeuristic, NeighborSearchStat<NearestNeighborSort>, arma::mat>,
+ tree::RStarTreeDescentHeuristic,
+ NeighborSearchStat<NearestNeighborSort>,
+ arma::mat> > allknn1(&RTree,
+ dataset, true);
+
+ BOOST_REQUIRE_EQUAL(RTree.NumDescendants(), 1000);
+// checkSync(RTree);
+// checkContainment(RTree);
+// checkExactContainment(RTree);
+
+ allknn1.Search(5, neighbors1, distances1);
+
+ // nearest neighbor search the naive way.
+ mlpack::neighbor::AllkNN allknn2(dataset,
+ true, true);
+
+ allknn2.Search(5, neighbors2, distances2);
+
+ for (size_t i = 0; i < neighbors1.size(); i++) {
+ BOOST_REQUIRE_EQUAL(neighbors1[i], neighbors2[i]);
+ BOOST_REQUIRE_EQUAL(distances1[i], distances2[i]);
+ }
+}
+
+
+
+
+
+*/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
// Test the tree splitting. We set MaxLeafSize and MaxNumChildren rather low
// to allow us to test by hand without adding hundreds of points.
BOOST_AUTO_TEST_CASE(RTreeSplitTest) {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list