[mlpack] 20/149: Implement Elkan's algorithm for k-means (it's pretty fast).

Barak A. Pearlmutter barak+git at pearlmutter.net
Sat May 2 09:11:04 UTC 2015


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch svn-trunk
in repository mlpack.

commit f3b12a683a03c59cc533a9fc5b330b698623f3ea
Author: rcurtin <rcurtin at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date:   Thu Oct 9 18:56:25 2014 +0000

    Implement Elkan's algorithm for k-means (it's pretty fast).
    
    
    git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@17221 9d5b8971-822b-0410-80eb-d18c1038ef23
---
 src/mlpack/methods/kmeans/elkan_kmeans.hpp      |  65 +++++++++
 src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp | 185 ++++++++++++++++++++++++
 2 files changed, 250 insertions(+)

diff --git a/src/mlpack/methods/kmeans/elkan_kmeans.hpp b/src/mlpack/methods/kmeans/elkan_kmeans.hpp
new file mode 100644
index 0000000..ea555fc
--- /dev/null
+++ b/src/mlpack/methods/kmeans/elkan_kmeans.hpp
@@ -0,0 +1,65 @@
+/**
+ * @file elkan_kmeans.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Elkan's algorithm for exact Lloyd iterations.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_HPP
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+class ElkanKMeans
+{
+ public:
+  /**
+   * Construct the ElkanKMeans object, which must store several sets of bounds.
+   */
+  ElkanKMeans(const MatType& dataset, MetricType& metric);
+
+  /**
+   * Run a single iteration of Elkan's algorithm, updating the given centroids
+   * into the newCentroids matrix.
+   *
+   * @param centroids Current cluster centroids.
+   * @param newCentroids New cluster centroids.
+   * @param counts Current counts, to be overwritten with new counts.
+   */
+  double Iterate(const arma::mat& centroids,
+                 arma::mat& newCentroids,
+                 arma::Col<size_t>& counts);
+
+  size_t DistanceCalculations() const { return distanceCalculations; }
+
+ private:
+  //! The dataset.
+  const MatType& dataset;
+  //! The instantiated metric.
+  MetricType& metric;
+
+  //! Holds intra-cluster distances.
+  arma::mat clusterDistances;
+  //! Half the distance from a cluster to its nearest cluster (s(c)).
+  arma::vec minClusterDistances;
+
+  //! Holds the index of the cluster that owns each point.
+  arma::Col<size_t> assignments;
+
+  //! Upper bounds on the distance between each point and its closest cluster.
+  arma::vec upperBounds;
+  //! Lower bounds on the distance between each point and each cluster.
+  arma::mat lowerBounds;
+
+  //! Track distance calculations.
+  size_t distanceCalculations;
+};
+
+} // namespace kmeans
+} // namespace mlpack
+
+// Include implementation.
+#include "elkan_kmeans_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp b/src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp
new file mode 100644
index 0000000..369324f
--- /dev/null
+++ b/src/mlpack/methods/kmeans/elkan_kmeans_impl.hpp
@@ -0,0 +1,185 @@
+/**
+ * @file elkan_kmeans_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of Elkan's algorithm for exact Lloyd iterations.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_ELKAN_KMEANS_IMPL_HPP
+
+#include "elkan_kmeans.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+ElkanKMeans<MetricType, MatType>::ElkanKMeans(const MatType& dataset,
+                                              MetricType& metric) :
+    dataset(dataset),
+    metric(metric),
+    distanceCalculations(0)
+{
+
+}
+
+// Run a single iteration of Elkan's algorithm for Lloyd iterations.
+template<typename MetricType, typename MatType>
+double ElkanKMeans<MetricType, MatType>::Iterate(const arma::mat& centroids,
+                                                 arma::mat& newCentroids,
+                                                 arma::Col<size_t>& counts)
+{
+  // Clear new centroids.
+  newCentroids.zeros(centroids.n_rows, centroids.n_cols);
+  counts.zeros(centroids.n_cols);
+
+  // At the beginning of the iteration, we must compute the distances between
+  // all centers.  This is O(k^2).
+  clusterDistances.set_size(centroids.n_cols, centroids.n_cols);
+
+  // Self-distances are always 0, but we set them to DBL_MAX to avoid the self
+  // being the closest cluster centroid.
+  clusterDistances.diag().fill(DBL_MAX);
+
+  // Initially set r(x) to true.
+  std::vector<bool> mustRecalculate(dataset.n_cols, true);
+
+  // If this is the first iteration, we must reset all the bounds.
+  if (lowerBounds.n_rows != centroids.n_cols)
+  {
+    lowerBounds.set_size(centroids.n_cols, dataset.n_cols);
+    assignments.set_size(dataset.n_cols);
+    upperBounds.set_size(dataset.n_cols);
+
+    lowerBounds.fill(0);
+    upperBounds.fill(DBL_MAX);
+    assignments.fill(0);
+  }
+
+  // Step 1: for all centers, compute between-cluster distances.  For all
+  // centers, compute s(c) = 1/2 min d(c, c').
+  for (size_t i = 0; i < centroids.n_cols; ++i)
+  {
+    for (size_t j = i + 1; j < centroids.n_cols; ++j)
+    {
+      const double distance = metric.Evaluate(centroids.col(i),
+                                              centroids.col(j));
+      distanceCalculations++;
+      clusterDistances(i, j) = distance;
+      clusterDistances(j, i) = distance;
+    }
+  }
+
+  // Now find the closest cluster to each other cluster.  We multiply by 0.5 so
+  // that this is equivalent to s(c) for each cluster c.
+  minClusterDistances = 0.5 * arma::min(clusterDistances).t();
+
+  // Now loop over all points, and see which ones need to be updated.
+  for (size_t i = 0; i < dataset.n_cols; ++i)
+  {
+    // Step 2: identify all points such that u(x) <= s(c(x)).
+    if (upperBounds(i) <= minClusterDistances(assignments[i]))
+    {
+      // No change needed.  This point must still belong to that cluster.
+      counts(assignments[i])++;
+      newCentroids.col(assignments[i]) += arma::vec(dataset.col(i));
+      continue;
+    }
+    else
+    {
+      for (size_t c = 0; c < centroids.n_cols; ++c)
+      {
+        // Step 3: for all remaining points x and centers c such that c != c(x),
+        // u(x) > l(x, c) and u(x) > 0.5 d(c(x), c)...
+        if (assignments[i] == c)
+          continue; // Pruned because this cluster is already the assignment.
+
+        if (upperBounds(i) <= lowerBounds(c, i))
+          continue; // Pruned by triangle inequality on lower bound.
+
+        if (upperBounds(i) <= 0.5 * clusterDistances(assignments[i], c))
+          continue; // Pruned by triangle inequality on cluster distances.
+
+        // Step 3a: if r(x) then compute d(x, c(x)) and assign r(x) = false.
+        // Otherwise, d(x, c(x)) = u(x).
+        double dist;
+        if (mustRecalculate[i])
+        {
+          mustRecalculate[i] = false;
+          dist = metric.Evaluate(dataset.col(i), centroids.col(assignments[i]));
+          lowerBounds(assignments[i], i) = dist;
+          upperBounds(i) = dist;
+          distanceCalculations++;
+
+          // Check if we can prune again.
+          if (upperBounds(i) <= lowerBounds(c, i))
+            continue; // Pruned by triangle inequality on lower bound.
+
+          if (upperBounds(i) <= 0.5 * clusterDistances(assignments[i], c))
+            continue; // Pruned by triangle inequality on cluster distances.
+        }
+        else
+        {
+          dist = upperBounds(i); // This is equivalent to d(x, c(x)).
+        }
+
+        // Step 3b: if d(x, c(x)) > l(x, c) or d(x, c(x)) > 0.5 d(c(x), c)...
+        if (dist > lowerBounds(c, i) ||
+            dist > 0.5 * clusterDistances(assignments[i], c))
+        {
+          // Compute d(x, c).  If d(x, c) < d(x, c(x)) then assign c(x) = c.
+          const double pointDist = metric.Evaluate(dataset.col(i),
+                                                   centroids.col(c));
+          lowerBounds(c, i) = pointDist;
+          distanceCalculations++;
+          if (pointDist < dist)
+          {
+            upperBounds(i) = pointDist;
+            assignments[i] = c;
+          }
+        }
+      }
+    }
+
+    // At this point, we know the new cluster assignment.
+    // Step 4: for each center c, let m(c) be the mean of the points assigned to
+    // c.
+    newCentroids.col(assignments[i]) += arma::vec(dataset.col(i));
+    counts[assignments[i]]++;
+  }
+
+  // Now, normalize and calculate the distance each cluster has moved.
+  arma::vec moveDistances(centroids.n_cols);
+  double cNorm = 0.0; // Cluster movement for residual.
+  for (size_t c = 0; c < centroids.n_cols; ++c)
+  {
+    if (counts[c] > 0)
+      newCentroids.col(c) /= counts[c];
+    else
+      newCentroids.fill(DBL_MAX); // Fill with invalid value.
+
+    moveDistances(c) = metric.Evaluate(newCentroids.col(c), centroids.col(c));
+    cNorm += std::pow(moveDistances(c), 2.0);
+    distanceCalculations++;
+  }
+
+  for (size_t i = 0; i < dataset.n_cols; ++i)
+  {
+    // Step 5: for each point x and center c, assign
+    //   l(x, c) = max { l(x, c) - d(c, m(c)), 0 }.
+    // But it doesn't actually matter if l(x, c) is positive.
+    for (size_t c = 0; c < centroids.n_cols; ++c)
+      lowerBounds(c, i) -= moveDistances(c);
+
+    // Step 6: for each point x, assign
+    //   u(x) = u(x) + d(m(c(x)), c(x))
+    //   r(x) = true (we are setting that at the start of every iteration).
+    upperBounds(i) += moveDistances(assignments[i]);
+  }
+
+  return sqrt(cNorm);
+}
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list