[mlpack] 20/149: Implement Elkan's algorithm for k-means (it's pretty fast).
Barak A. Pearlmutter
barak+git at pearlmutter.net
Sat May 2 09:11:04 UTC 2015
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch svn-trunk
in repository mlpack.
commit f3b12a683a03c59cc533a9fc5b330b698623f3ea
Author: rcurtin <rcurtin at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date: Thu Oct 9 18:56:25 2014 +0000
Implement Elkan's algorithm for k-means (it's pretty fast).
git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@17221 9d5b8971-822b-0410-80eb-d18c1038ef23
---
src/mlpack/methods/kmeans/elkan_kmeans.hpp | 65 +++++++++
src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp | 185 ++++++++++++++++++++++++
2 files changed, 250 insertions(+)
diff --git a/src/mlpack/methods/kmeans/elkan_kmeans.hpp b/src/mlpack/methods/kmeans/elkan_kmeans.hpp
new file mode 100644
index 0000000..ea555fc
--- /dev/null
+++ b/src/mlpack/methods/kmeans/elkan_kmeans.hpp
@@ -0,0 +1,65 @@
+/**
+ * @file elkan_kmeans.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Elkan's algorithm for exact Lloyd iterations.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_HPP
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+class ElkanKMeans
+{
+ public:
+ /**
+ * Construct the ElkanKMeans object, which must store several sets of bounds.
+ */
+ ElkanKMeans(const MatType& dataset, MetricType& metric);
+
+ /**
+ * Run a single iteration of Elkan's algorithm, writing the updated centroids
+ * into newCentroids; returns the norm of the per-cluster centroid movement.
+ *
+ * @param centroids Current cluster centroids.
+ * @param newCentroids New cluster centroids.
+ * @param counts Current counts, to be overwritten with new counts.
+ */
+ double Iterate(const arma::mat& centroids,
+ arma::mat& newCentroids,
+ arma::Col<size_t>& counts);
+
+ size_t DistanceCalculations() const { return distanceCalculations; }
+
+ private:
+ //! The dataset.
+ const MatType& dataset;
+ //! The instantiated metric.
+ MetricType& metric;
+
+ //! Holds inter-cluster (centroid-to-centroid) distances.
+ arma::mat clusterDistances;
+ //! Half the distance from a cluster to its nearest cluster (s(c)).
+ arma::vec minClusterDistances;
+
+ //! Holds the index of the cluster that owns each point.
+ arma::Col<size_t> assignments;
+
+ //! Upper bounds on the distance between each point and its assigned cluster.
+ arma::vec upperBounds;
+ //! Lower bounds on the distance between each point and each cluster.
+ arma::mat lowerBounds;
+
+ //! Running count of metric evaluations performed so far.
+ size_t distanceCalculations;
+};
+
+} // namespace kmeans
+} // namespace mlpack
+
+// Include implementation.
+#include "elkan_kmeans_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp b/src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp
new file mode 100644
index 0000000..369324f
--- /dev/null
+++ b/src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp
@@ -0,0 +1,185 @@
+/**
+ * @file elkan_kmeans_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Elkan's algorithm for exact Lloyd iterations.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_IMPL_HPP
+
+#include "elkan_kmeans.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+ElkanKMeans<MetricType, MatType>::ElkanKMeans(const MatType& dataset,
+ MetricType& metric) :
+ dataset(dataset),
+ metric(metric),
+ distanceCalculations(0)
+{
+ // Bounds are left empty here; Iterate() sizes them when the shape mismatches.
+}
+
+// Run one Lloyd iteration via Elkan's bounds; returns the centroid move norm.
+template<typename MetricType, typename MatType>
+double ElkanKMeans<MetricType, MatType>::Iterate(const arma::mat& centroids,
+ arma::mat& newCentroids,
+ arma::Col<size_t>& counts)
+{
+ // Clear new centroids.
+ newCentroids.zeros(centroids.n_rows, centroids.n_cols);
+ counts.zeros(centroids.n_cols);
+
+ // At the beginning of the iteration, we must compute the distances between
+ // all centers. This is O(k^2).
+ clusterDistances.set_size(centroids.n_cols, centroids.n_cols);
+
+ // Self-distances are always 0, but we set them to DBL_MAX to avoid the self
+ // being the closest cluster centroid.
+ clusterDistances.diag().fill(DBL_MAX);
+
+ // Initially set r(x) to true.
+ std::vector<bool> mustRecalculate(dataset.n_cols, true);
+
+ // On the first iteration (or if k changed), we must reset all the bounds.
+ if (lowerBounds.n_rows != centroids.n_cols)
+ {
+ lowerBounds.set_size(centroids.n_cols, dataset.n_cols);
+ assignments.set_size(dataset.n_cols);
+ upperBounds.set_size(dataset.n_cols);
+
+ lowerBounds.fill(0);
+ upperBounds.fill(DBL_MAX);
+ assignments.fill(0);
+ }
+
+ // Step 1: for all centers, compute between-cluster distances. For all
+ // centers, compute s(c) = 1/2 min d(c, c').
+ for (size_t i = 0; i < centroids.n_cols; ++i)
+ {
+ for (size_t j = i + 1; j < centroids.n_cols; ++j)
+ {
+ const double distance = metric.Evaluate(centroids.col(i),
+ centroids.col(j));
+ distanceCalculations++;
+ clusterDistances(i, j) = distance;
+ clusterDistances(j, i) = distance;
+ }
+ }
+
+ // Now find the closest cluster to each other cluster. We multiply by 0.5 so
+ // that this is equivalent to s(c) for each cluster c.
+ minClusterDistances = 0.5 * arma::min(clusterDistances).t();
+
+ // Now loop over all points, and see which ones need to be updated.
+ for (size_t i = 0; i < dataset.n_cols; ++i)
+ {
+ // Step 2: identify all points such that u(x) <= s(c(x)).
+ if (upperBounds(i) <= minClusterDistances(assignments[i]))
+ {
+ // No change needed. This point must still belong to that cluster.
+ counts(assignments[i])++;
+ newCentroids.col(assignments[i]) += arma::vec(dataset.col(i));
+ continue;
+ }
+ else
+ {
+ for (size_t c = 0; c < centroids.n_cols; ++c)
+ {
+ // Step 3: for all remaining points x and centers c such that c != c(x),
+ // u(x) > l(x, c) and u(x) > 0.5 d(c(x), c)...
+ if (assignments[i] == c)
+ continue; // Pruned because this cluster is already the assignment.
+
+ if (upperBounds(i) <= lowerBounds(c, i))
+ continue; // Pruned by triangle inequality on lower bound.
+
+ if (upperBounds(i) <= 0.5 * clusterDistances(assignments[i], c))
+ continue; // Pruned by triangle inequality on cluster distances.
+
+ // Step 3a: if r(x) then compute d(x, c(x)) and assign r(x) = false.
+ // Otherwise, d(x, c(x)) = u(x).
+ double dist;
+ if (mustRecalculate[i])
+ {
+ mustRecalculate[i] = false;
+ dist = metric.Evaluate(dataset.col(i), centroids.col(assignments[i]));
+ lowerBounds(assignments[i], i) = dist;
+ upperBounds(i) = dist;
+ distanceCalculations++;
+
+ // Check if we can prune again.
+ if (upperBounds(i) <= lowerBounds(c, i))
+ continue; // Pruned by triangle inequality on lower bound.
+
+ if (upperBounds(i) <= 0.5 * clusterDistances(assignments[i], c))
+ continue; // Pruned by triangle inequality on cluster distances.
+ }
+ else
+ {
+ dist = upperBounds(i); // This is equivalent to d(x, c(x)).
+ }
+
+ // Step 3b: if d(x, c(x)) > l(x, c) or d(x, c(x)) > 0.5 d(c(x), c)...
+ if (dist > lowerBounds(c, i) ||
+ dist > 0.5 * clusterDistances(assignments[i], c))
+ {
+ // Compute d(x, c). If d(x, c) < d(x, c(x)) then assign c(x) = c.
+ const double pointDist = metric.Evaluate(dataset.col(i),
+ centroids.col(c));
+ lowerBounds(c, i) = pointDist;
+ distanceCalculations++;
+ if (pointDist < dist)
+ {
+ upperBounds(i) = pointDist;
+ assignments[i] = c;
+ }
+ }
+ }
+ }
+
+ // At this point, we know the new cluster assignment.
+ // Step 4: for each center c, let m(c) be the mean of the points assigned to
+ // c.
+ newCentroids.col(assignments[i]) += arma::vec(dataset.col(i));
+ counts[assignments[i]]++;
+ }
+
+ // Now, normalize and calculate the distance each cluster has moved.
+ arma::vec moveDistances(centroids.n_cols);
+ double cNorm = 0.0; // Sum of squared cluster movements, for the residual.
+ for (size_t c = 0; c < centroids.n_cols; ++c)
+ {
+ if (counts[c] > 0)
+ newCentroids.col(c) /= counts[c];
+ else
+ newCentroids.col(c).fill(DBL_MAX); // Empty cluster: invalidate only c.
+
+ moveDistances(c) = metric.Evaluate(newCentroids.col(c), centroids.col(c));
+ cNorm += std::pow(moveDistances(c), 2.0);
+ distanceCalculations++;
+ }
+
+ for (size_t i = 0; i < dataset.n_cols; ++i)
+ {
+ // Step 5: for each point x and center c, assign
+ // l(x, c) = max { l(x, c) - d(c, m(c)), 0 }.
+ // But it doesn't actually matter if l(x, c) is positive.
+ for (size_t c = 0; c < centroids.n_cols; ++c)
+ lowerBounds(c, i) -= moveDistances(c);
+
+ // Step 6: for each point x, assign
+ // u(x) = u(x) + d(m(c(x)), c(x))
+ // r(x) = true (we are setting that at the start of every iteration).
+ upperBounds(i) += moveDistances(assignments[i]);
+ }
+
+ return std::sqrt(cNorm);
+}
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list