[mlpack] 10/20: Significant refactoring for clarity.
Barak A. Pearlmutter
barak+git at pearlmutter.net
Thu May 25 20:44:09 UTC 2017
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch master
in repository mlpack.
commit 2c4c1ea9c7fae0743b3d13153ce9ad9e41b27989
Author: Ryan Curtin <ryan at ratml.org>
Date: Wed May 3 17:19:16 2017 -0400
Significant refactoring for clarity.
---
.../tree/rectangle_tree/r_star_tree_split_impl.hpp | 848 ++++++---------------
1 file changed, 253 insertions(+), 595 deletions(-)
diff --git a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
index 51ddfea..3813936 100644
--- a/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
+++ b/src/mlpack/core/tree/rectangle_tree/r_star_tree_split_impl.hpp
@@ -33,30 +33,12 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
typedef typename TreeType::ElemType ElemType;
typedef bound::HRectBound<metric::EuclideanDistance, ElemType> BoundType;
+ // If there's no need to split, don't.
if (tree->Count() <= tree->MaxLeafSize())
return;
-// std::cout << "split leaf node " << tree << " with parent " << tree->Parent()
-//<< "\n";
-
- // If we are splitting the root node, we need will do things differently so
- // that the constructor and other methods don't confuse the end user by giving
- // an address of another node.
-// if (tree->Parent() == NULL)
-// {
- // We actually want to copy this way. Pointers and everything.
-// TreeType* copy = new TreeType(*tree, false);
-// copy->Parent() = tree;
-// tree->Count() = 0;
-// tree->NullifyData();
- // Because this was a leaf node, numChildren must be 0.
-// tree->children[(tree->NumChildren())++] = copy;
-// assert(tree->NumChildren() == 1);
-
-// RStarTreeSplit::SplitLeafNode(copy,relevels);
-// return;
-// }
-
- // If we haven't yet reinserted on this level, we try doing so now.
+
+ // If we haven't yet checked if we need to reinsert on this level, we try
+ // doing so now.
if (relevels[tree->TreeDepth() - 1])
{
relevels[tree->TreeDepth() - 1] = false;
@@ -67,132 +49,121 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
while (root->Parent() != NULL)
root = root->Parent();
size_t p = tree->MaxLeafSize() * 0.3; // The paper says this works the best.
- if (p == 0)
- {
- RStarTreeSplit::SplitLeafNode(tree,relevels);
- return;
- }
-
- std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
- arma::Col<ElemType> center;
- tree->Bound().Center(center); // Modifies centroid.
- for (size_t i = 0; i < sorted.size(); i++)
+ if (p > 0)
{
- sorted[i].first = tree->Metric().Evaluate(center,
- tree->Dataset().col(tree->Point(i)));
- sorted[i].second = i;
- }
+ // We'll only do reinsertions if p > 0. p is the number of points we will
+ // reinsert. If p == 0, then no reinsertion is necessary and we continue
+ // with the splitting procedure.
+ std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
+ arma::Col<ElemType> center;
+ tree->Bound().Center(center);
+ for (size_t i = 0; i < sorted.size(); i++)
+ {
+ sorted[i].first = tree->Metric().Evaluate(center,
+ tree->dataset->col(tree->Point(i)));
+ sorted[i].second = tree->Point(i);
+ }
- std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
- std::vector<size_t> pointIndices(sorted.size());
+ std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
- for (size_t i = 0; i < p; i++)
- {
- // We start from the end of sorted.
- pointIndices[i] = tree->Point(sorted[sorted.size() - 1 - i].second);
+ // Remove the points furthest from the center of the node.
+ for (size_t i = 0; i < p; i++)
+ root->DeletePoint(sorted[sorted.size() - 1 - i].second, relevels);
- root->DeletePoint(tree->Point(sorted[sorted.size() - 1 - i].second),
- relevels);
- }
+ // Now reinsert the points, but reverse the order---insert the closest to
+ // the center first.
+ for (size_t i = p; i > 0; --i)
+ root->InsertPoint(sorted[sorted.size() - i].second, relevels);
- for (size_t i = 0; i < p; i++)
- {
- // We reverse the order again to reinsert the closest points first.
- root->InsertPoint(pointIndices[p - 1 - i], relevels);
+ // Any reinsertions take care of splitting.
+ return;
}
-
- return;
}
- int bestOverlapIndexOnBestAxis = 0;
- int bestAreaIndexOnBestAxis = 0;
- bool tiedOnOverlap = false;
- int bestAxis = 0;
- ElemType bestAxisScore = std::numeric_limits<ElemType>::max();
+ // We don't need to reinsert. Instead, we need to split the node.
+ size_t bestAxis = 0;
+ size_t bestSplitIndex = 0;
+ ElemType bestScore = std::numeric_limits<ElemType>::max();
+ /**
+ * Check each dimension, to find which dimension is best to split on.
+ */
for (size_t j = 0; j < tree->Bound().Dim(); j++)
{
ElemType axisScore = 0.0;
- // Since we only have points in the leaf nodes, we only need to sort once.
- std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
- for (size_t i = 0; i < sorted.size(); i++)
- {
- sorted[i].first = tree->Dataset().col(tree->Point(i))[j];
- sorted[i].second = i;
- }
- std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
+ // Sort in increasing values of the selected dimension j.
+ arma::Col<ElemType> dimValues(tree->Count());
+ for (size_t i = 0; i < tree->Count(); ++i)
+ dimValues[i] = tree->Dataset().col(tree->Point(i))[j];
+ arma::uvec sortedIndices = arma::sort_index(dimValues);
// We'll store each of the three scores for each distribution.
- std::vector<ElemType> areas(tree->MaxLeafSize() - 2 * tree->MinLeafSize() +
- 2);
- std::vector<ElemType> margins(tree->MaxLeafSize() - 2 * tree->MinLeafSize() +
- 2);
- std::vector<ElemType> overlapedAreas(tree->MaxLeafSize() -
- 2 * tree->MinLeafSize() + 2);
-
- for (size_t i = 0; i < areas.size(); i++)
- {
- areas[i] = 0.0;
- margins[i] = 0.0;
- overlapedAreas[i] = 0.0;
- }
+ const size_t numPossibleSplits = tree->MaxLeafSize() -
+ 2 * tree->MinLeafSize() + 2;
+ arma::Col<ElemType> areas(numPossibleSplits, arma::fill::zeros);
+ arma::Col<ElemType> margins(numPossibleSplits, arma::fill::zeros);
+ arma::Col<ElemType> overlaps(numPossibleSplits, arma::fill::zeros);
- for (size_t i = 0; i < areas.size(); i++)
+ for (size_t i = 0; i < numPossibleSplits; i++)
{
// The ith arrangement is obtained by placing the first
// tree->MinLeafSize() + i points in one rectangle and the rest in
// another. Then we calculate the three scores for that distribution.
-
- size_t cutOff = tree->MinLeafSize() + i;
+ size_t splitIndex = tree->MinLeafSize() + i;
BoundType bound1(tree->Bound().Dim());
BoundType bound2(tree->Bound().Dim());
- for (size_t l = 0; l < cutOff; l++)
- bound1 |= tree->Dataset().col(tree->Point(sorted[l].second));
+ for (size_t l = 0; l < splitIndex; l++)
+ bound1 |= tree->Dataset().col(tree->Point(sortedIndices[l]));
- for (size_t l = cutOff; l < tree->Count(); l++)
- bound2 |= tree->Dataset().col(tree->Point(sorted[l].second));
+ for (size_t l = splitIndex; l < tree->Count(); l++)
+ bound2 |= tree->Dataset().col(tree->Point(sortedIndices[l]));
- ElemType area1 = bound1.Volume();
- ElemType area2 = bound2.Volume();
- ElemType oArea = bound1.Overlap(bound2);
+ areas[i] = bound1.Volume() + bound2.Volume();
+ overlaps[i] = bound1.Overlap(bound2);
for (size_t k = 0; k < bound1.Dim(); k++)
margins[i] += bound1[k].Width() + bound2[k].Width();
- areas[i] += area1 + area2;
- overlapedAreas[i] += oArea;
axisScore += margins[i];
}
- if (axisScore < bestAxisScore)
+ // Is this dimension a new best score? We want the lowest possible score.
+ if (axisScore < bestScore)
{
- bestAxisScore = axisScore;
+ bestScore = axisScore;
bestAxis = j;
- bestOverlapIndexOnBestAxis = 0;
- bestAreaIndexOnBestAxis = 0;
+ size_t overlapIndex = 0;
+ size_t areaIndex = 0;
+ bool tiedOnOverlap = false;
- for (size_t i = 1; i < areas.size(); i++)
+ for (size_t i = 1; i < areas.n_elem; i++)
{
- if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis])
+ if (overlaps[i] < overlaps[overlapIndex])
{
tiedOnOverlap = false;
- bestAreaIndexOnBestAxis = i;
- bestOverlapIndexOnBestAxis = i;
+ overlapIndex = i;
+ areaIndex = i;
}
- else if (overlapedAreas[i] ==
- overlapedAreas[bestOverlapIndexOnBestAxis])
+ else if (overlaps[i] == overlaps[overlapIndex])
{
tiedOnOverlap = true;
- if (areas[i] < areas[bestAreaIndexOnBestAxis])
- bestAreaIndexOnBestAxis = i;
+ if (areas[i] < areas[areaIndex])
+ areaIndex = i;
}
}
+
+ // Select the best index for splitting.
+ bestSplitIndex = (tiedOnOverlap ? areaIndex : overlapIndex);
}
}
+ /**
+ * Now that we have found the best dimension to split on, re-sort in that
+ * dimension to prepare for reinsertion of points into the new nodes.
+ */
std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
for (size_t i = 0; i < sorted.size(); i++)
{
@@ -202,128 +173,57 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
- if (tree->Parent())
+ /**
+ * If 'tree' is the root of the tree (i.e. if it has no parent), then we must
+ * create two new child nodes, distribute the points from the original node
+ * among them, and insert those. If 'tree' is not the root of the tree, then
+ * we may create only one new child node, redistribute the points from the
+ * original node between 'tree' and the new node, then insert those nodes into
+ * the parent.
+ *
+ * Here we simply set treeOne and treeTwo to the right values to avoid code
+ * duplication.
+ */
+ TreeType* par = tree->Parent();
+ TreeType* treeOne = (par) ? tree : new TreeType(tree);
+ TreeType* treeTwo = (par) ? new TreeType(par) : new TreeType(tree);
+
+ // Now clean the node, and we will re-use this.
+ const size_t numPoints = tree->Count();
+
+ // Reset the original node's values, regardless of whether it will become
+ // the new parent or not.
+ tree->numChildren = 0;
+ tree->numDescendants = 0;
+ tree->count = 0;
+ tree->bound.Clear();
+
+ // Insert the points into the appropriate tree.
+ for (size_t i = 0; i < numPoints; i++)
{
-// std::cout << "first check parent " << tree->Parent() << " numDescendants "
-//<< tree->Parent()->NumDescendants() <<
-//" with " << tree->Parent()->NumChildren() << " children\n";
-// size_t manualCount = 0;
-// for (size_t i = 0; i < tree->Parent()->NumChildren(); ++i)
-// {
-// std::cout << "(" << &(tree->Parent()->Child(i)) << ") ";
-// manualCount += tree->Parent()->Child(i).NumDescendants();
-// std::cout << tree->Parent()->Child(i).NumDescendants() << " ";
-// }
-// std::cout << "\n";
- // TreeType* treeOne = new TreeType(tree->Parent());
- // Now clean the node, and we will re-use this.
- const size_t oldDescendants = tree->numDescendants;
- const size_t numPoints = tree->count;
- tree->numChildren = 0;
- tree->numDescendants = 0;
- tree->bound.Clear();
- tree->count = 0;
- tree->begin = 0;
-
- TreeType* treeTwo = new TreeType(tree->Parent());
-
- if (tiedOnOverlap)
- {
- for (size_t i = 0; i < numPoints; i++)
- {
- if (i < bestAreaIndexOnBestAxis + tree->MinLeafSize())
- tree->InsertPoint(sorted[i].second);
- else
- treeTwo->InsertPoint(sorted[i].second);
- }
- }
+ if (i < bestSplitIndex + tree->MinLeafSize())
+ treeOne->InsertPoint(sorted[i].second);
else
- {
- for (size_t i = 0; i < numPoints; i++)
- {
- if (i < bestOverlapIndexOnBestAxis + tree->MinLeafSize())
- tree->InsertPoint(sorted[i].second);
- else
- treeTwo->InsertPoint(sorted[i].second);
- }
- }
+ treeTwo->InsertPoint(sorted[i].second);
+ }
- // Insert the new tree node.
- TreeType* par = tree->Parent();
+ // Insert the new tree node(s).
+ if (par)
+ {
+ // Just insert the new node into the parent.
par->children[par->NumChildren()++] = treeTwo;
- // We only add one at a time, so we should only need to test for equality
- // just in case, we use an assert.
-// std::cout << "x: " << oldDescendants << " == " << tree->NumDescendants() <<
-//" + " << treeTwo->NumDescendants() << " (" << tree->NumDescendants() +
-//treeTwo->NumDescendants() << ")\n";
- assert(oldDescendants == tree->NumDescendants() + treeTwo->NumDescendants());
-// std::cout << "check parent " << par << " numDescendants " << par->NumDescendants() <<
-//" with " << par->NumChildren() << " children\n";
-// manualCount = 0;
-// for (size_t i = 0; i < par->NumChildren(); ++i)
-// {
-// std::cout << "(" << &(par->Child(i)) << ") ";
-// manualCount += par->Child(i).NumDescendants();
-// std::cout << par->Child(i).NumDescendants() << " ";
-// }
-// std::cout << "\n";
-// assert(par->NumDescendants() == manualCount);
- assert(par->NumChildren() <= par->MaxNumChildren() + 1);
+ // If we have overflowed the parent's children, then we need to split that
+ // node also.
if (par->NumChildren() == par->MaxNumChildren() + 1)
RStarTreeSplit::SplitNonLeafNode(par, relevels);
-
- assert(tree->Parent()->NumChildren() <= tree->MaxNumChildren());
- assert(tree->Parent()->NumChildren() >= tree->MinNumChildren());
- assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
- assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
}
else
{
- TreeType* treeOne = new TreeType(tree);
- TreeType* treeTwo = new TreeType(tree);
-
- const size_t oldNumDescendants = tree->NumDescendants();
- const size_t numPoints = tree->Count();
- tree->numChildren = 0;
- tree->bound.Clear();
- tree->count = 0;
- tree->begin = 0;
- tree->numDescendants = 0;
-
- if (tiedOnOverlap)
- {
- for (size_t i = 0; i < numPoints; ++i)
- {
- if (i < bestAreaIndexOnBestAxis + tree->MinLeafSize())
- treeOne->InsertPoint(sorted[i].second);
- else
- treeTwo->InsertPoint(sorted[i].second);
- }
- }
- else
- {
- for (size_t i = 0; i < numPoints; ++i)
- {
- if (i < bestOverlapIndexOnBestAxis + tree->MinLeafSize())
- treeOne->InsertPoint(sorted[i].second);
- else
- treeTwo->InsertPoint(sorted[i].second);
- }
- }
-
+ // Now insert the two nodes into 'tree', which is now a higher-level root
+ // node in the tree.
InsertNodeIntoTree(tree, treeOne);
InsertNodeIntoTree(tree, treeTwo);
-
-// std::cout << "y: " << oldNumDescendants << ": " << treeOne->NumDescendants()
-//<< " + " << treeTwo->NumDescendants() << " (" << tree->NumDescendants() <<
-//")\n";
-// std::cout << "tree left " << treeOne->NumDescendants() << ", right " <<
-//treeTwo->NumDescendants() << ", parent " << tree->NumDescendants() << "\n";
-// for (size_t i = 0; i < tree->NumChildren(); ++i)
-// std::cout << tree->Child(i).NumDescendants() << " ";
-// std::cout << " (that's the children of the parent)\n";
- assert(oldNumDescendants == tree->NumDescendants());
}
}
@@ -337,435 +237,205 @@ void RStarTreeSplit::SplitLeafNode(TreeType *tree,std::vector<bool>& relevels)
template<typename TreeType>
bool RStarTreeSplit::SplitNonLeafNode(TreeType *tree,std::vector<bool>& relevels)
{
-// std::cout << "split nonleaf node " << tree << " with parent " <<
-//tree->Parent() << "\n";
// Convenience typedef.
typedef typename TreeType::ElemType ElemType;
typedef bound::HRectBound<metric::EuclideanDistance, ElemType> BoundType;
- // If we are splitting the root node, we need will do things differently so
- // that the constructor and other methods don't confuse the end user by giving
- // an address of another node.
-// if (tree->Parent() == NULL)
-// {
- // We actually want to copy this way. Pointers and everything.
-// TreeType* copy = new TreeType(*tree, false);
-
-// copy->Parent() = tree;
-// tree->NumChildren() = 0;
-// tree->NullifyData();
-// tree->children[(tree->NumChildren())++] = copy;
-
-// RStarTreeSplit::SplitNonLeafNode(copy,relevels);
-// return true;
-// }
-
- /*
- // If we haven't yet reinserted on this level, we try doing so now.
- if (relevels[tree->TreeDepth()]) {
- relevels[tree->TreeDepth()] = false;
- // We sort the points by decreasing centroid to centroid distance.
- // We then remove the first p entries and reinsert them at the root.
- RectangleTree<RStarTreeSplit, DescentType, StatisticType, MatType>* root = tree;
- while(root->Parent() != NULL)
- root = root->Parent();
- size_t p = tree->MaxNumChildren() * 0.3; // The paper says this works the best.
- if (p == 0) {
- SplitNonLeafNode(tree, relevels);
- return false;
- }
+ // Reinsertion isn't done for non-leaf nodes; the paper doesn't seem to make
+ // it clear how to reinsert an entire node without reinserting each of the
+ // points, so we will avoid that here. This is a possible area for
+ // improvement of this code.
- std::vector<sortStruct> sorted(tree->NumChildren());
- arma::vec c1;
- tree->Bound().Center(c1); // Modifies c1.
- for(size_t i = 0; i < sorted.size(); i++) {
- arma::vec c2;
- tree->Children()[i]->Bound().Center(c2); // Modifies c2.
- sorted[i].d = tree->Bound().Metric().Evaluate(c1,c2);
- sorted[i].n = i;
- }
- std::sort(sorted.begin(), sorted.end(), structComp);
-
- //bool startBelowMinFill = tree->NumChildren() < tree->MinNumChildren();
+ size_t bestAxis = 0;
+ size_t bestIndex = 0;
+ ElemType bestScore = std::numeric_limits<ElemType>::max();
+ bool lowIsBetter = true;
- std::vector<RectangleTree<RStarTreeSplit, DescentType, StatisticType, MatType>*> removedNodes(p);
- for(size_t i =0; i < p; i++) {
- removedNodes[i] = tree->Children()[sorted[sorted.size()-1-i].n]; // We start from the end of sorted.
- root->RemoveNode(tree->Children()[sorted[sorted.size()-1-i].n], relevels);
- }
-
- for(size_t i = 0; i < p; i++) {
- root->InsertNode(removedNodes[p-1-i], tree->TreeDepth(), relevels); // We reverse the order again to reinsert the closest nodes first.
- }
-
- // If we went below min fill, delete this node and reinsert all children.
- //SOMETHING IS WRONG. SHOULD NOT GO BELOW MIN FILL.
-// if (!startBelowMinFill && tree->NumChildren() < tree->MinNumChildren())
-// std::cout<<"MINFILLERROR "<< p << ", " << tree->NumChildren() << "; " << tree->MaxNumChildren()<<std::endl;
-
-// if (tree->NumChildren() < tree->MinNumChildren()) {
-// std::vector<RectangleTree<RStarTreeSplit, DescentType, StatisticType, MatType>*> rmNodes(tree->NumChildren());
-// for(size_t i = 0; i < rmNodes.size(); i++) {
-// rmNodes[i] = tree->Children()[i];
-// }
-// root->RemoveNode(tree, relevels);
-// for(size_t i = 0; i < rmNodes.size(); i++) {
-// root->InsertNode(rmNodes[i], tree->TreeDepth(), relevels);
-// }
-// // tree->SoftDelete();
-// }
- // assert(tree->NumChildren() >= tree->MinNumChildren());
- // assert(tree->NumChildren() <= tree->MaxNumChildren());
-
- return false;
- }
-
-*/
-
- int bestOverlapIndexOnBestAxis = 0;
- int bestAreaIndexOnBestAxis = 0;
- bool tiedOnOverlap = false;
- bool lowIsBest = true;
- int bestAxis = 0;
- ElemType bestAxisScore = std::numeric_limits<ElemType>::max();
+ /**
+ * Check over each dimension to see which is best to use for splitting.
+ */
for (size_t j = 0; j < tree->Bound().Dim(); j++)
{
- ElemType axisScore = 0.0;
+ ElemType axisLoScore = 0.0;
+ ElemType axisHiScore = 0.0;
- // We'll do Bound().Lo() now and use Bound().Hi() later.
- std::vector<std::pair<ElemType, size_t>> sorted(tree->NumChildren());
- for (size_t i = 0; i < sorted.size(); i++)
- {
- sorted[i].first = tree->Child(i).Bound()[j].Lo();
- sorted[i].second = i;
- }
-
- std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
-
- // We'll store each of the three scores for each distribution.
- std::vector<ElemType> areas(tree->MaxNumChildren() -
- 2 * tree->MinNumChildren() + 2);
- std::vector<ElemType> margins(tree->MaxNumChildren() -
- 2 * tree->MinNumChildren() + 2);
- std::vector<ElemType> overlapedAreas(tree->MaxNumChildren() -
- 2 * tree->MinNumChildren() + 2);
-
- for (size_t i = 0; i < areas.size(); i++)
+ // We have to calculate values for both the lower and higher parts of the
+ // bound.
+ arma::Col<ElemType> loDimValues(tree->NumChildren());
+ arma::Col<ElemType> hiDimValues(tree->NumChildren());
+ for (size_t i = 0; i < tree->NumChildren(); i++)
{
- areas[i] = 0.0;
- margins[i] = 0.0;
- overlapedAreas[i] = 0.0;
+ loDimValues[i] = tree->Child(i).Bound()[j].Lo();
+ hiDimValues[i] = tree->Child(i).Bound()[j].Hi();
}
-
- for (size_t i = 0; i < areas.size(); i++)
+ arma::uvec sortedLoDimIndices = arma::sort_index(loDimValues);
+ arma::uvec sortedHiDimIndices = arma::sort_index(hiDimValues);
+
+ // We'll store each of the three scores for each distribution. Remember
+ // that these are the sums calculated over both the low and high bounds of
+ // each rectangle.
+ const size_t numPossibleSplits = tree->MaxNumChildren() -
+ 2 * tree->MinNumChildren() + 2;
+ arma::Col<ElemType> areas(2 * numPossibleSplits, arma::fill::zeros);
+ arma::Col<ElemType> margins(2 * numPossibleSplits, arma::fill::zeros);
+ arma::Col<ElemType> overlaps(2 * numPossibleSplits, arma::fill::zeros);
+
+ for (size_t i = 0; i < numPossibleSplits; ++i)
{
// The ith arrangement is obtained by placing the first
// tree->MinNumChildren() + i points in one rectangle and the rest in
// another. Then we calculate the three scores for that distribution.
+ const size_t splitIndex = tree->MinNumChildren() + i;
- size_t cutOff = tree->MinNumChildren() + i;
-
- BoundType bound1(tree->Bound().Dim());
- BoundType bound2(tree->Bound().Dim());
-
- for (size_t l = 0; l < cutOff; l++)
- bound1 |= tree->Child(sorted[l].second).Bound();
+ BoundType lb1(tree->Bound().Dim());
+ BoundType lb2(tree->Bound().Dim());
+ BoundType hb1(tree->Bound().Dim());
+ BoundType hb2(tree->Bound().Dim());
- for (size_t l = cutOff; l < tree->NumChildren(); l++)
- bound2 |= tree->Child(sorted[l].second).Bound();
-
- ElemType area1 = bound1.Volume();
- ElemType area2 = bound2.Volume();
- ElemType oArea = bound1.Overlap(bound2);
-
- for (size_t k = 0; k < bound1.Dim(); k++)
- margins[i] += bound1[k].Width() + bound2[k].Width();
-
- areas[i] += area1 + area2;
- overlapedAreas[i] += oArea;
- axisScore += margins[i];
- }
-
- if (axisScore < bestAxisScore)
- {
- bestAxisScore = axisScore;
- bestAxis = j;
- bestOverlapIndexOnBestAxis = 0;
- bestAreaIndexOnBestAxis = 0;
- for (size_t i = 1; i < areas.size(); i++)
+ for (size_t l = 0; l < splitIndex; ++l)
{
- if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis])
- {
- tiedOnOverlap = false;
- bestAreaIndexOnBestAxis = i;
- bestOverlapIndexOnBestAxis = i;
- }
- else if (overlapedAreas[i] ==
- overlapedAreas[bestOverlapIndexOnBestAxis])
- {
- tiedOnOverlap = true;
- if (areas[i] < areas[bestAreaIndexOnBestAxis])
- bestAreaIndexOnBestAxis = i;
- }
+ lb1 |= tree->Child(sortedLoDimIndices[l]).Bound();
+ hb1 |= tree->Child(sortedHiDimIndices[l]).Bound();
+ }
+ for (size_t l = splitIndex; l < tree->NumChildren(); ++l)
+ {
+ lb2 |= tree->Child(sortedLoDimIndices[l]).Bound();
+ hb2 |= tree->Child(sortedHiDimIndices[l]).Bound();
}
- }
- }
- // Now we do the same thing using Bound().Hi() and choose the best of the two.
- for (size_t j = 0; j < tree->Bound().Dim(); j++)
- {
- ElemType axisScore = 0.0;
+ // Calculate low bound distributions.
+ areas[2 * i] = lb1.Volume() + lb2.Volume();
+ overlaps[2 * i] = lb1.Overlap(lb2);
- std::vector<std::pair<ElemType, size_t>> sorted(tree->NumChildren());
- for (size_t i = 0; i < sorted.size(); i++)
- {
- sorted[i].first = tree->Child(i).Bound()[j].Hi();
- sorted[i].second = i;
- }
+ // Calculate high bound distributions.
+ areas[2 * i + 1] = hb1.Volume() + hb2.Volume();
+ overlaps[2 * i + 1] = hb1.Overlap(hb2);
- std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
+ // Now calculate margins for each.
+ for (size_t k = 0; k < lb1.Dim(); k++)
+ {
+ margins[2 * i] += lb1[k].Width() + lb2[k].Width();
+ margins[2 * i + 1] += hb1[k].Width() + hb2[k].Width();
+ }
- // We'll store each of the three scores for each distribution.
- std::vector<ElemType> areas(tree->MaxNumChildren() -
- 2 * tree->MinNumChildren() + 2);
- std::vector<ElemType> margins(tree->MaxNumChildren() -
- 2 * tree->MinNumChildren() + 2);
- std::vector<ElemType> overlapedAreas(tree->MaxNumChildren() -
- 2 * tree->MinNumChildren() + 2);
-
- for (size_t i = 0; i < areas.size(); i++)
- {
- areas[i] = 0.0;
- margins[i] = 0.0;
- overlapedAreas[i] = 0.0;
+ // The score we use is the sum of all scores.
+ axisLoScore += margins[2 * i];
+ axisHiScore += margins[2 * i + 1];
}
- for (size_t i = 0; i < areas.size(); i++)
+ // If this dimension's score (for lower or higher bound scores) is a new
+ // best, then extract the necessary split information.
+ if (std::min(axisLoScore, axisHiScore) < bestScore)
{
- // The ith arrangement is obtained by placing the first
- // tree->MinNumChildren() + i points in one rectangle and the rest in
- // another. Then we calculate the three scores for that distribution.
+ bestScore = std::min(axisLoScore, axisHiScore);
+ if (axisLoScore < axisHiScore)
+ lowIsBetter = true;
+ else
+ lowIsBetter = false;
- size_t cutOff = tree->MinNumChildren() + i;
-
- BoundType bound1(tree->Bound().Dim());
- BoundType bound2(tree->Bound().Dim());
-
- for (size_t l = 0; l < cutOff; l++)
- bound1 |= tree->Child(sorted[l].second).Bound();
-
- for (size_t l = cutOff; l < tree->NumChildren(); l++)
- bound2 |= tree->Child(sorted[l].second).Bound();
-
- ElemType area1 = bound1.Volume();
- ElemType area2 = bound2.Volume();
- ElemType oArea = bound1.Overlap(bound2);
-
- for (size_t k = 0; k < bound1.Dim(); k++)
- margins[i] += bound1[k].Width() + bound2[k].Width();
+ bestAxis = j;
- areas[i] += area1 + area2;
- overlapedAreas[i] += oArea;
- axisScore += margins[i];
- }
+ // This will get us either the lower or higher bound depending on which we
+ // want. Remember that we selected *either* the lower or higher bounds to
+ // split on, so we want to only check those.
+ const size_t indexOffset = lowIsBetter ? 0 : 1;
- if (axisScore < bestAxisScore)
- {
- bestAxisScore = axisScore;
- bestAxis = j;
- lowIsBest = false;
- bestOverlapIndexOnBestAxis = 0;
- bestAreaIndexOnBestAxis = 0;
+ size_t overlapIndex = indexOffset;
+ size_t areaIndex = indexOffset;
+ bool tiedOnOverlap = false;
- for (size_t i = 1; i < areas.size(); i++)
+ // Find the best possible split (and whether it is on the low values or
+ // high values of the bounds).
+ for (size_t i = 1; i < numPossibleSplits; i++)
{
- if (overlapedAreas[i] < overlapedAreas[bestOverlapIndexOnBestAxis])
+ // Check bounds.
+ if (overlaps[2 * i + indexOffset] < overlaps[overlapIndex])
{
tiedOnOverlap = false;
- bestAreaIndexOnBestAxis = i;
- bestOverlapIndexOnBestAxis = i;
+ areaIndex = 2 * i + indexOffset;
+ overlapIndex = 2 * i + indexOffset;
}
- else if (overlapedAreas[i] ==
- overlapedAreas[bestOverlapIndexOnBestAxis])
+ else if (overlaps[i] == overlaps[overlapIndex])
{
tiedOnOverlap = true;
- if (areas[i] < areas[bestAreaIndexOnBestAxis])
- bestAreaIndexOnBestAxis = i;
+ if (areas[2 * i + indexOffset] < areas[areaIndex])
+ areaIndex = 2 * i + indexOffset;
}
}
+
+ bestIndex = ((tiedOnOverlap ? areaIndex : overlapIndex) - indexOffset) / 2
+ + tree->MinNumChildren();
}
}
- std::vector<std::pair<ElemType, TreeType*>> sorted(tree->NumChildren());
- if (lowIsBest)
+ // Get a list of the old children.
+ std::vector<TreeType*> oldChildren(tree->NumChildren());
+ for (size_t i = 0; i < oldChildren.size(); ++i)
+ oldChildren[i] = &tree->Child(i);
+
+ /**
+ * If 'tree' is the root of the tree (i.e. if it has no parent), then we must
+ * create two new child nodes, distribute the children from the original node
+ * among them, and insert those. If 'tree' is not the root of the tree, then
+ * we may create only one new child node, redistribute the children from the
+ * original node between 'tree' and the new node, then insert those nodes into
+ * the parent.
+ *
+ * Here, we simply set treeOne and treeTwo to the right values to avoid code
+ * duplication.
+ */
+ TreeType* par = tree->Parent();
+ TreeType* treeOne = par ? tree : new TreeType(tree);
+ TreeType* treeTwo = par ? new TreeType(par) : new TreeType(tree);
+
+ // Now clean the node.
+ tree->numChildren = 0;
+ tree->numDescendants = 0;
+ tree->count = 0;
+ tree->bound.Clear();
+
+ // Assemble vector of values.
+ arma::Col<ElemType> values(oldChildren.size());
+ for (size_t i = 0; i < oldChildren.size(); ++i)
{
- for (size_t i = 0; i < sorted.size(); i++)
- {
- sorted[i].first = tree->Child(i).Bound()[bestAxis].Lo();
- sorted[i].second = &tree->Child(i);
- }
- }
- else
- {
- for (size_t i = 0; i < sorted.size(); i++)
- {
- sorted[i].first = tree->Child(i).Bound()[bestAxis].Hi();
- sorted[i].second = &tree->Child(i);
- }
+ values[i] = (lowIsBetter ? oldChildren[i]->Bound()[bestAxis].Lo()
+ : oldChildren[i]->Bound()[bestAxis].Hi());
}
+ arma::uvec indices = arma::sort_index(values);
- std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, TreeType*>);
+ for (size_t i = 0; i < bestIndex; ++i)
+ InsertNodeIntoTree(treeOne, oldChildren[indices[i]]);
+ for (size_t i = bestIndex; i < oldChildren.size(); ++i)
+ InsertNodeIntoTree(treeTwo, oldChildren[indices[i]]);
- if (tree->Parent() != NULL)
+ // Insert the new tree node(s).
+ if (par)
{
- const size_t oldNumDescendants = tree->NumDescendants();
-// for (size_t i = 0; i < tree->NumChildren(); ++i)
-// std::cout << tree->Child(i).NumDescendants() << " ";
-// std::cout << " (total " << tree->NumDescendants() << ", count " <<
-//tree->count << "\n";
- const size_t oldNumChildren = tree->NumChildren();
- tree->numChildren = 0;
- tree->bound.Clear();
- tree->count = 0;
- tree->begin = 0;
- tree->numDescendants = 0;
- TreeType* treeTwo = new TreeType(tree->Parent());
-
- if (tiedOnOverlap)
- {
- for (size_t i = 0; i < oldNumChildren; i++)
- {
- if (i < bestAreaIndexOnBestAxis + tree->MinNumChildren())
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into tree"
-// << "\n";
- InsertNodeIntoTree(tree, sorted[i].second);
- }
- else
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
- InsertNodeIntoTree(treeTwo, sorted[i].second);
- }
- }
- }
- else
- {
- for (size_t i = 0; i < oldNumChildren; i++)
- {
- if (i < bestOverlapIndexOnBestAxis + tree->MinNumChildren())
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into tree\n";
- InsertNodeIntoTree(tree, sorted[i].second);
- }
- else
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
- InsertNodeIntoTree(treeTwo, sorted[i].second);
- }
- }
- }
-
- // Insert the new node into the tree.
- TreeType* par = tree->Parent();
+ // Insert the new node into the parent. The number of descendants does not
+ // need to be updated.
par->children[par->NumChildren()++] = treeTwo;
-
- // We only add one at a time, so we should only need to test for equality
- // just in case, we use an assert.
- assert(par->NumChildren() <= par->MaxNumChildren() + 1);
- if (par->NumChildren() == par->MaxNumChildren() + 1)
- RStarTreeSplit::SplitNonLeafNode(par,relevels);
-
- // We have to update the children of each of these new nodes so that they
- // record the correct parent.
- for (size_t i = 0; i < tree->NumChildren(); i++)
- tree->children[i]->Parent() = tree;
-
- for (size_t i = 0; i < treeTwo->NumChildren(); i++)
- treeTwo->children[i]->Parent() = treeTwo;
-
-// std::cout << "tree left " << tree->NumDescendants() << ", right " <<
-//treeTwo->NumDescendants() << ", parent " << par->NumDescendants() << "\n";
-// for (size_t i = 0; i < par->NumChildren(); ++i)
-// std::cout << par->Child(i).NumDescendants() << " ";
-// std::cout << " (that's the children of the parent)\n";
- assert(oldNumDescendants == (tree->NumDescendants() +
-treeTwo->NumDescendants()));
-
-
- assert(tree->Parent()->NumChildren() <= tree->MaxNumChildren());
- assert(tree->Parent()->NumChildren() >= tree->MinNumChildren());
- assert(treeTwo->Parent()->NumChildren() <= treeTwo->MaxNumChildren());
- assert(treeTwo->Parent()->NumChildren() >= treeTwo->MinNumChildren());
-
- assert(tree->MaxNumChildren() < 7);
- assert(treeTwo->MaxNumChildren() < 7);
}
else
{
- const size_t oldDescendants = tree->NumDescendants();
-// for (size_t i = 0; i < tree->NumChildren(); ++i)
-// std::cout << tree->Child(i).NumDescendants() << " ";
-// std::cout << " (total " << tree->NumDescendants() << ", count " <<
-//tree->count << "\n";
- TreeType* treeOne = new TreeType(tree);
- TreeType* treeTwo = new TreeType(tree);
-
- const size_t oldNumChildren = tree->NumChildren();
- tree->count = 0;
- tree->numChildren = 0;
- tree->bound.Clear();
- tree->numDescendants = 0;
-
- if (tiedOnOverlap)
- {
- for (size_t i = 0; i < oldNumChildren; i++)
- {
- if (i < bestAreaIndexOnBestAxis + tree->MinNumChildren())
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeOne\n";
- InsertNodeIntoTree(treeOne, sorted[i].second);
- }
- else
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
- InsertNodeIntoTree(treeTwo, sorted[i].second);
- }
- }
- }
- else
- {
- for (size_t i = 0; i < oldNumChildren; i++)
- {
- if (i < bestOverlapIndexOnBestAxis + tree->MinNumChildren())
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeOne\n";
- InsertNodeIntoTree(treeOne, sorted[i].second);
- }
- else
- {
-// std::cout << "insert " << sorted[i].second->NumDescendants() << " into treeTwo\n";
- InsertNodeIntoTree(treeTwo, sorted[i].second);
- }
- }
- }
-
+ // Insert both nodes into 'tree', which is now a higher-level root node.
InsertNodeIntoTree(tree, treeOne);
InsertNodeIntoTree(tree, treeTwo);
-// std::cout << oldDescendants << "; " << treeOne->numDescendants << ", " <<
-//treeTwo->numDescendants << " --> " << tree->numDescendants << "\n";
- assert(oldDescendants == (treeOne->numDescendants +
- treeTwo->numDescendants));
-
- // We have to update the children of each of these new nodes so that they
- // record the correct parent.
+ // We have to update the children of treeOne so that they record the correct
+ // parent.
for (size_t i = 0; i < treeOne->NumChildren(); i++)
treeOne->children[i]->Parent() = treeOne;
-
- for (size_t i = 0; i < treeTwo->NumChildren(); i++)
- treeTwo->children[i]->Parent() = treeTwo;
}
+ // Update the children of treeTwo to have the correct parent.
+ for (size_t i = 0; i < treeTwo->NumChildren(); i++)
+ treeTwo->children[i]->Parent() = treeTwo;
+
+ // If we have overflowed hte parent's children, then we need to split that
+ // node also.
+ if (par && par->NumChildren() >= par->MaxNumChildren() + 1)
+ RStarTreeSplit::SplitNonLeafNode(par, relevels);
+
return false;
}
@@ -776,21 +446,9 @@ treeTwo->NumDescendants()));
template<typename TreeType>
void RStarTreeSplit::InsertNodeIntoTree(TreeType* destTree, TreeType* srcNode)
{
-// std::cout << "insert " << srcNode << " into " << destTree << "\n";
-
destTree->Bound() |= srcNode->Bound();
destTree->numDescendants += srcNode->numDescendants;
destTree->children[destTree->NumChildren()++] = srcNode;
-
-// std::cout << "dest now has " << destTree->NumDescendants() << "\n";
-// size_t manualCount = 0;
-// for (size_t i = 0; i < destTree->NumChildren(); ++i)
-// {
-// manualCount += destTree->Child(i).NumDescendants();
-// std::cout << destTree->Child(i).NumDescendants() << " ";
-// }
-// std::cout << "\n";
-// assert(manualCount == destTree->NumDescendants());
}
} // namespace tree
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list