[mlpack] 09/324: Remove the Pelleg-Moore k-means implementation; it is being replaced.
Barak A. Pearlmutter
barak+git at cs.nuim.ie
Sun Aug 17 08:21:51 UTC 2014
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch svn-trunk
in repository mlpack.
commit 7fa87c559d84c10cd89b0169f1547be1ae9ee4d4
Author: rcurtin <rcurtin at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date: Wed May 21 16:58:08 2014 +0000
Remove the Pelleg-Moore k-means implementation; it is being replaced.
git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@16529 9d5b8971-822b-0410-80eb-d18c1038ef23
---
src/mlpack/methods/kmeans/kmeans.hpp | 12 +-
src/mlpack/methods/kmeans/kmeans_impl.hpp | 433 ------------------------------
src/mlpack/methods/kmeans/kmeans_main.cpp | 22 +-
3 files changed, 4 insertions(+), 463 deletions(-)
diff --git a/src/mlpack/methods/kmeans/kmeans.hpp b/src/mlpack/methods/kmeans/kmeans.hpp
index e070e5d..44883f4 100644
--- a/src/mlpack/methods/kmeans/kmeans.hpp
+++ b/src/mlpack/methods/kmeans/kmeans.hpp
@@ -142,16 +142,6 @@ class KMeans
const bool initialAssignmentGuess = false,
const bool initialCentroidGuess = false) const;
- /**
- * An implementation of k-means using the Pelleg-Moore algorithm; this is
- * known to not work -- do not use it! (Fixing it is TODO, of course; see
- * #251.)
- */
- template<typename MatType>
- void FastCluster(MatType& data,
- const size_t clusters,
- arma::Col<size_t>& assignments) const;
-
//! Return the overclustering factor.
double OverclusteringFactor() const { return overclusteringFactor; }
//! Set the overclustering factor. Must be greater than 1.
@@ -178,7 +168,7 @@ class KMeans
//! Modify the empty cluster policy.
EmptyClusterPolicy& EmptyClusterAction() { return emptyClusterAction; }
- // Returns a string representation of this object.
+ // Returns a string representation of this object.
std::string ToString() const;
private:
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 6d48517..1ef8c6f 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -49,439 +49,6 @@ KMeans(const size_t maxIterations,
}
}
-template<typename MetricType,
- typename InitialPartitionPolicy,
- typename EmptyClusterPolicy>
-template<typename MatType>
-void KMeans<
- MetricType,
- InitialPartitionPolicy,
- EmptyClusterPolicy>::
-FastCluster(MatType& data,
- const size_t clusters,
- arma::Col<size_t>& assignments) const
-{
- size_t actualClusters = size_t(overclusteringFactor * clusters);
- if (actualClusters > data.n_cols)
- {
- Log::Warn << "KMeans::Cluster(): overclustering factor is too large. No "
- << "overclustering will be done." << std::endl;
- actualClusters = clusters;
- }
-
- size_t dimensionality = data.n_rows;
-
- // Centroids of each cluster. Each column corresponds to a centroid.
- MatType centroids(dimensionality, actualClusters);
- centroids.zeros();
-
- // Counts of points in each cluster.
- arma::Col<size_t> counts(actualClusters);
- counts.zeros();
-
- // Build the mrkd-tree on this dataset.
- tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic>
- tree(data, 1);
- Log::Debug << "Tree Built." << std::endl;
- // A pointer for traversing the mrkd-tree.
- tree::BinarySpaceTree<typename bound::HRectBound<2>, tree::MRKDStatistic>*
- node;
-
- // Now, the initial assignments. First determine if they are necessary.
- if (assignments.n_elem != data.n_cols)
- {
- // Use the partitioner to come up with the partition assignments.
- partitioner.Cluster(data, actualClusters, assignments);
- }
-
- // Set counts correctly.
- for (size_t i = 0; i < assignments.n_elem; i++)
- counts[assignments[i]]++;
-
- // Sum the points for each centroid
- for (size_t i = 0; i < data.n_cols; i++)
- centroids.col(assignments[i]) += data.col(i);
-
- // Then divide the sums by the count to get the center of mass for this
- // centroids assigned points
- for (size_t i = 0; i < actualClusters; i++)
- centroids.col(i) /= counts[i];
-
- // Instead of retraversing the tree after an iteration, we will update
- // centroid positions in this matrix, which also prevents clobbering our
- // centroids from the previous iteration.
- MatType newCentroids(dimensionality, centroids.n_cols);
-
- // Create a stack for traversing the mrkd-tree.
- std::stack<typename tree::BinarySpaceTree<typename bound::HRectBound<2>,
- tree::MRKDStatistic>* > stack;
-
- // A variable to keep track of how many kmeans iterations we have made.
- size_t iteration = 0;
-
- // A variable to keep track of how many nodes assignments have changed in
- // each kmeans iteration.
- size_t changedAssignments = 0;
-
- // A variable to keep track of the number of times something is skipped due
- // to the blacklist.
- size_t skip = 0;
-
- // A variable to keep track of the number of distances calculated.
- size_t comps = 0;
-
- // A variable to keep track of how often we stop at a parent node.
- size_t dominations = 0;
- do
- {
- // Keep track of what iteration we are on.
- ++iteration;
- changedAssignments = 0;
-
- // Reset the newCentroids so that we can store the newly calculated ones
- // here.
- newCentroids.zeros();
-
- // Reset the counts.
- counts.zeros();
-
- // Add the root node of the tree to the stack.
- stack.push(&tree);
- // Set the top level whitelist.
- tree.Stat().Whitelist().resize(centroids.n_cols, true);
-
- // Traverse the tree.
- while (!stack.empty())
- {
- // Get the next node in the tree.
- node = stack.top();
- // Remove the node from the stack.
- stack.pop();
-
- // Get a reference to the mrkd statistic for this hyperrectangle.
- tree::MRKDStatistic& mrkd = node->Stat();
-
- // We use this to store the index of the centroid with the minimum
- // distance from this hyperrectangle or point.
- size_t minIndex = 0;
-
- // If this node is a leaf, then we calculate the distance from
- // the centroids to every point the node contains.
- if (node->IsLeaf())
- {
- for (size_t i = mrkd.Begin(); i < mrkd.Count() + mrkd.Begin(); ++i)
- {
- // Initialize minDistance to be nonzero.
- double minDistance = metric.Evaluate(data.col(i), centroids.col(0));
-
- // Find the minimal distance centroid for this point.
- for (size_t j = 1; j < centroids.n_cols; ++j)
- {
- // If this centroid is not in the whitelist, skip it.
- if (!mrkd.Whitelist()[j])
- {
- ++skip;
- continue;
- }
-
- ++comps;
- double distance = metric.Evaluate(data.col(i), centroids.col(j));
- if (minDistance > distance)
- {
- minIndex = j;
- minDistance = distance;
- }
- }
-
- // Add this point to the undivided center of mass summation for its
- // assigned centroid.
- newCentroids.col(minIndex) += data.col(i);
-
- // Increment the count for the minimum distance centroid.
- ++counts(minIndex);
-
- // If we actually changed assignments, increment changedAssignments
- // and modify the assignment vector for this point.
- if (assignments(i) != minIndex)
- {
- ++changedAssignments;
- assignments(i) = minIndex;
- }
- }
- }
- // If this node is not a leaf, then we continue trying to find dominant
- // centroids.
- else
- {
- bound::HRectBound<2>& bound = node->Bound();
-
- // A flag to keep track of if we find a single centroid that is closer
- // to all points in this hyperrectangle than any other centroid.
- bool noDomination = false;
-
- // Calculate the center of mass of this hyperrectangle.
- arma::vec center = mrkd.CenterOfMass() / mrkd.Count();
-
- // Set the minDistance to the maximum value of a double so any value
- // must be smaller than this.
- double minDistance = std::numeric_limits<double>::max();
-
- // The candidate distance we calculate for each centroid.
- double distance = 0.0;
-
- // How many points are inside this hyperrectangle, we stop if we
- // see more than 1.
- size_t contains = 0;
-
- // Find the "owner" of this hyperrectangle, if one exists.
- for (size_t i = 0; i < centroids.n_cols; ++i)
- {
- // If this centroid is not in the whitelist, skip it.
- if (!mrkd.Whitelist()[i])
- {
- ++skip;
- continue;
- }
-
- // Incrememnt the number of distance calculations for what we are
- // about to do.
- comps += 2;
-
- // Reinitialize the distance so += works right.
- distance = 0.0;
-
- // We keep track of how many dimensions have nonzero distance,
- // if this is 0 then the distance is 0.
- size_t nonZero = 0;
-
- /*
- Compute the distance to the hyperrectangle for this centroid.
- We do this by finding the furthest point from the centroid inside
- the hyperrectangle. This is a corner of the hyperrectangle.
-
- In order to do this faster, we calculate both the distance and the
- furthest point simultaneously.
-
- This following code is equivalent to, but faster than:
-
- arma::vec p;
- p.zeros(dimensionality);
-
- for (size_t j = 0; j < dimensionality; ++j)
- {
- if (centroids(j,i) < bound[j].Lo())
- p(j) = bound[j].Lo();
- else
- p(j) = bound[j].Hi();
- }
-
- distance = metric.Evaluate(p.col(0), centroids.col(i));
- */
- for (size_t j = 0; j < dimensionality; ++j)
- {
- double ij = centroids(j,i);
- double lo = bound[j].Lo();
-
- if (ij < lo)
- {
- // (ij - lo)^2
- ij -= lo;
- ij *= ij;
-
- distance += ij;
- ++nonZero;
- }
- else
- {
- double hi = bound[j].Hi();
- if (ij > hi)
- {
- // (ij - hi)^2
- ij -= hi;
- ij *= ij;
-
- distance += ij;
- ++nonZero;
- }
- }
- }
-
- // The centroid is inside the hyperrectangle.
- if (nonZero == 0)
- {
- ++contains;
- minDistance = 0.0;
- minIndex = i;
-
- // If more than two points are within this hyperrectangle, then
- // there can be no dominating centroid, so we should continue
- // to the children nodes.
- if (contains > 1)
- {
- noDomination = true;
- break;
- }
- }
-
- if (fabs(distance - minDistance) <= 1e-10)
- {
- noDomination = true;
- break;
- }
- else if (distance < minDistance)
- {
- minIndex = i;
- minDistance = distance;
- }
- }
-
- distance = minDistance;
- // Determine if the owner dominates this centroid only if there was
- // exactly one owner.
- if (!noDomination)
- {
- for (size_t i = 0; i < centroids.n_cols; ++i)
- {
- if (i == minIndex)
- continue;
- // If this centroid is blacklisted for this hyperrectangle, then
- // we skip it.
- if (!mrkd.Whitelist()[i])
- {
- ++skip;
- continue;
- }
- /*
- Compute the dominating centroid for this hyperrectangle, if one
- exists. We do this by calculating the point which is furthest
- from the min'th centroid in the direction of c_k - c_min. We do
- this as outlined in the Pelleg and Moore paper.
-
- This following code is equivalent to, but faster than:
-
- arma::vec p;
- p.zeros(dimensionality);
-
- for (size_t k = 0; k < dimensionality; ++k)
- {
- p(k) = (centroids(k,i) > centroids(k,minIndex)) ?
- bound[k].Hi() : bound[k].Lo();
- }
-
- double distancei = metric.Evaluate(p.col(0), centroids.col(i));
- double distanceMin = metric.Evaluate(p.col(0),
- centroids.col(minIndex));
- */
-
- comps += 1;
- double distancei = 0.0;
- double distanceMin = 0.0;
- for (size_t k = 0; k < dimensionality; ++k)
- {
- double ci = centroids(k, i);
- double cm = centroids(k, minIndex);
- if (ci > cm)
- {
- double hi = bound[k].Hi();
-
- ci -= hi;
- cm -= hi;
-
- ci *= ci;
- cm *= cm;
-
- distancei += ci;
- distanceMin += cm;
- }
- else
- {
- double lo = bound[k].Lo();
-
- ci -= lo;
- cm -= lo;
-
- ci *= ci;
- cm *= cm;
-
- distancei += ci;
- distanceMin += cm;
- }
- }
-
- if (distanceMin >= distancei)
- {
- noDomination = true;
- break;
- }
- else
- {
- mrkd.Whitelist()[i] = false;
- }
- }
- }
-
- // If did found a centroid that was closer to every point in the
- // hyperrectangle than every other centroid, then update that centroid.
- if (!noDomination)
- {
- // Adjust the new centroid sum for the min distance point to this
- // hyperrectangle by the center of mass of this hyperrectangle.
- newCentroids.col(minIndex) += mrkd.CenterOfMass();
-
- // Increment the counts for this centroid.
- counts(minIndex) += mrkd.Count();
-
- // Update all assignments for this node.
- const size_t begin = node->Begin();
- const size_t end = node->End();
-
- // TODO: Do this outside of the kmeans iterations.
- for (size_t j = begin; j < end; ++j)
- {
- if (assignments(j) != minIndex)
- {
- ++changedAssignments;
- assignments(j) = minIndex;
- }
- }
- mrkd.DominatingCentroid() = minIndex;
-
- // Keep track of the number of times we found a dominating centroid.
- ++dominations;
- }
-
- // If we did not find a dominating centroid then we fall through to the
- // default case, where we add the children of this node to the stack.
- else
- {
- // Add this hyperrectangle's children to our stack.
- stack.push(node->Left());
- stack.push(node->Right());
-
- // (Re)Initialize the whiteList for the children.
- node->Left()->Stat().Whitelist() = mrkd.Whitelist();
- node->Right()->Stat().Whitelist() = mrkd.Whitelist();
- }
- }
-
- }
-
- // Divide by the number of points assigned to the centroids so that we
- // have the actual center of mass and update centroids' positions.
- for (size_t i = 0; i < centroids.n_cols; ++i)
- if (counts(i))
- centroids.col(i) = newCentroids.col(i) / counts(i);
-
- // Stop when we reach max iterations or we changed no assignments
- // assignments.
- } while (changedAssignments > 0 && iteration != maxIterations);
-
- Log::Info << "Iterations: " << iteration << std::endl
- << "Skips: " << skip << std::endl
- << "Comparisons: " << comps << std::endl
- << "Dominations: " << dominations << std::endl;
-}
-
/**
* Perform k-means clustering on the data, returning a list of cluster
* assignments. This just forward to the other function, which returns the
diff --git a/src/mlpack/methods/kmeans/kmeans_main.cpp b/src/mlpack/methods/kmeans/kmeans_main.cpp
index bd1458a..3f81020 100644
--- a/src/mlpack/methods/kmeans/kmeans_main.cpp
+++ b/src/mlpack/methods/kmeans/kmeans_main.cpp
@@ -55,10 +55,6 @@ PARAM_INT("seed", "Random seed. If 0, 'std::time(NULL)' is used.", "s", 0);
PARAM_STRING("initial_centroids", "Start with the specified initial centroids.",
"I", "");
-// This is known to not work (#251).
-//PARAM_FLAG("fast_kmeans", "Use the experimental fast k-means algorithm by "
-// "Pelleg and Moore.", "f");
-
// Parameters for "refined start" k-means.
PARAM_FLAG("refined_start", "Use the refined initial point strategy by Bradley "
"and Fayyad to choose initial points.", "r");
@@ -151,9 +147,6 @@ int main(int argc, char** argv)
RefinedStart(samplings, percentage));
Timer::Start("clustering");
-// if (CLI::HasParam("fast_kmeans"))
-// k.FastCluster(dataset, clusters, assignments);
-// else
k.Cluster(dataset, clusters, assignments, centroids);
Timer::Stop("clustering");
}
@@ -163,9 +156,6 @@ int main(int argc, char** argv)
AllowEmptyClusters> k(maxIterations, overclustering);
Timer::Start("clustering");
-// if (CLI::HasParam("fast_kmeans"))
-// k.FastCluster(dataset, clusters, assignments);
-// else
k.Cluster(dataset, clusters, assignments, centroids, false,
initialCentroidGuess);
Timer::Stop("clustering");
@@ -190,10 +180,7 @@ int main(int argc, char** argv)
RefinedStart(samplings, percentage));
Timer::Start("clustering");
-// if (CLI::HasParam("fast_kmeans"))
-// k.FastCluster(dataset, clusters, assignments);
-// else
- k.Cluster(dataset, clusters, assignments, centroids);
+ k.Cluster(dataset, clusters, assignments, centroids);
Timer::Stop("clustering");
}
else
@@ -201,11 +188,8 @@ int main(int argc, char** argv)
KMeans<> k(maxIterations, overclustering);
Timer::Start("clustering");
-// if (CLI::HasParam("fast_kmeans"))
-// k.FastCluster(dataset, clusters, assignments);
-// else
- k.Cluster(dataset, clusters, assignments, centroids, false,
- initialCentroidGuess);
+ k.Cluster(dataset, clusters, assignments, centroids, false,
+ initialCentroidGuess);
Timer::Stop("clustering");
}
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list