[mlpack] 244/324: Refactor KMeans so that the actual Lloyd iteration step is separate, since there are many ways to do a Lloyd iteration.
Barak A. Pearlmutter
barak+git at cs.nuim.ie
Sun Aug 17 08:22:15 UTC 2014
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch svn-trunk
in repository mlpack.
commit 507f1e273c2f5d672da3ea983e9c1929ae39e67e
Author: rcurtin <rcurtin at 9d5b8971-822b-0410-80eb-d18c1038ef23>
Date: Tue Jul 29 22:04:27 2014 +0000
Refactor KMeans so that the actual Lloyd iteration step is separate, since there
are many ways to do a Lloyd iteration.
git-svn-id: http://svn.cc.gatech.edu/fastlab/mlpack/trunk@16923 9d5b8971-822b-0410-80eb-d18c1038ef23
---
src/mlpack/methods/kmeans/CMakeLists.txt | 2 +
src/mlpack/methods/kmeans/allow_empty_clusters.hpp | 2 +-
src/mlpack/methods/kmeans/kmeans.hpp | 17 +-
src/mlpack/methods/kmeans/kmeans_impl.hpp | 183 ++++++++++++---------
.../methods/kmeans/max_variance_new_cluster.hpp | 2 +-
.../kmeans/max_variance_new_cluster_impl.hpp | 9 +-
src/mlpack/methods/kmeans/naive_kmeans.hpp | 45 +++++
src/mlpack/methods/kmeans/naive_kmeans_impl.hpp | 68 ++++++++
8 files changed, 238 insertions(+), 90 deletions(-)
diff --git a/src/mlpack/methods/kmeans/CMakeLists.txt b/src/mlpack/methods/kmeans/CMakeLists.txt
index 85995a0..1e61c30 100644
--- a/src/mlpack/methods/kmeans/CMakeLists.txt
+++ b/src/mlpack/methods/kmeans/CMakeLists.txt
@@ -6,6 +6,8 @@ set(SOURCES
kmeans_impl.hpp
max_variance_new_cluster.hpp
max_variance_new_cluster_impl.hpp
+ naive_kmeans.hpp
+ naive_kmeans_impl.hpp
random_partition.hpp
refined_start.hpp
refined_start_impl.hpp
diff --git a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
index ac956b3..ae476b7 100644
--- a/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
+++ b/src/mlpack/methods/kmeans/allow_empty_clusters.hpp
@@ -39,7 +39,7 @@ class AllowEmptyClusters
template<typename MatType>
static size_t EmptyCluster(const MatType& /* data */,
const size_t /* emptyCluster */,
- const MatType& /* centroids */,
+ const arma::mat& /* centroids */,
arma::Col<size_t>& /* clusterCounts */,
arma::Col<size_t>& /* assignments */)
{
diff --git a/src/mlpack/methods/kmeans/kmeans.hpp b/src/mlpack/methods/kmeans/kmeans.hpp
index e060905..06c3e5e 100644
--- a/src/mlpack/methods/kmeans/kmeans.hpp
+++ b/src/mlpack/methods/kmeans/kmeans.hpp
@@ -12,6 +12,7 @@
#include <mlpack/core/metrics/lmetric.hpp>
#include "random_partition.hpp"
#include "max_variance_new_cluster.hpp"
+#include "naive_kmeans.hpp"
#include <mlpack/core/tree/binary_space_tree.hpp>
@@ -51,12 +52,16 @@ namespace kmeans /** K-Means clustering. */ {
* @tparam EmptyClusterPolicy Policy for what to do on an empty cluster; must
* implement a default constructor and 'void EmptyCluster(const arma::mat&,
* arma::Col<size_t>&)'.
+ * @tparam LloydStepType Implementation of single Lloyd step to use.
*
- * @see RandomPartition, RefinedStart, AllowEmptyClusters, MaxVarianceNewCluster
+ * @see RandomPartition, RefinedStart, AllowEmptyClusters,
+ * MaxVarianceNewCluster, NaiveKMeans
*/
template<typename MetricType = metric::EuclideanDistance,
typename InitialPartitionPolicy = RandomPartition,
- typename EmptyClusterPolicy = MaxVarianceNewCluster>
+ typename EmptyClusterPolicy = MaxVarianceNewCluster,
+ template<class, class> class LloydStepType = NaiveKMeans,
+ typename MatType = arma::mat>
class KMeans
{
public:
@@ -102,11 +107,10 @@ class KMeans
* @param initialGuess If true, then it is assumed that assignments has a list
* of initial cluster assignments.
*/
- template<typename MatType>
void Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments,
- const bool initialGuess = false) const;
+ const bool initialGuess = false);
/**
* Perform k-means clustering on the data, returning a list of cluster
@@ -134,13 +138,12 @@ class KMeans
* @param initialCentroidGuess If true, then it is assumed that centroids
* contains the initial centroids of each cluster.
*/
- template<typename MatType>
void Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments,
- MatType& centroids,
+ arma::mat& centroids,
const bool initialAssignmentGuess = false,
- const bool initialCentroidGuess = false) const;
+ const bool initialCentroidGuess = false);
//! Return the overclustering factor.
double OverclusteringFactor() const { return overclusteringFactor; }
diff --git a/src/mlpack/methods/kmeans/kmeans_impl.hpp b/src/mlpack/methods/kmeans/kmeans_impl.hpp
index 1ef8c6f..936831c 100644
--- a/src/mlpack/methods/kmeans/kmeans_impl.hpp
+++ b/src/mlpack/methods/kmeans/kmeans_impl.hpp
@@ -10,9 +10,6 @@
#include <mlpack/core/tree/mrkd_statistic.hpp>
#include <mlpack/core/metrics/lmetric.hpp>
-#include <stack>
-#include <limits>
-
namespace mlpack {
namespace kmeans {
@@ -21,11 +18,15 @@ namespace kmeans {
*/
template<typename MetricType,
typename InitialPartitionPolicy,
- typename EmptyClusterPolicy>
+ typename EmptyClusterPolicy,
+ template<class, class> class LloydStepType,
+ typename MatType>
KMeans<
MetricType,
InitialPartitionPolicy,
- EmptyClusterPolicy>::
+ EmptyClusterPolicy,
+ LloydStepType,
+ MatType>::
KMeans(const size_t maxIterations,
const double overclusteringFactor,
const MetricType metric,
@@ -57,18 +58,21 @@ KMeans(const size_t maxIterations,
*/
template<typename MetricType,
typename InitialPartitionPolicy,
- typename EmptyClusterPolicy>
-template<typename MatType>
+ typename EmptyClusterPolicy,
+ template<class, class> class LloydStepType,
+ typename MatType>
inline void KMeans<
MetricType,
InitialPartitionPolicy,
- EmptyClusterPolicy>::
+ EmptyClusterPolicy,
+ LloydStepType,
+ MatType>::
Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments,
- const bool initialGuess) const
+ const bool initialGuess)
{
- MatType centroids(data.n_rows, clusters);
+ arma::mat centroids(data.n_rows, clusters);
Cluster(data, clusters, assignments, centroids, initialGuess);
}
@@ -78,18 +82,21 @@ Cluster(const MatType& data,
*/
template<typename MetricType,
typename InitialPartitionPolicy,
- typename EmptyClusterPolicy>
-template<typename MatType>
+ typename EmptyClusterPolicy,
+ template<class, class> class LloydStepType,
+ typename MatType>
void KMeans<
MetricType,
InitialPartitionPolicy,
- EmptyClusterPolicy>::
+ EmptyClusterPolicy,
+ LloydStepType,
+ MatType>::
Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments,
- MatType& centroids,
+ arma::mat& centroids,
const bool initialAssignmentGuess,
- const bool initialCentroidGuess) const
+ const bool initialCentroidGuess)
{
// Make sure we have more points than clusters.
if (clusters > data.n_cols)
@@ -105,6 +112,9 @@ Cluster(const MatType& data,
actualClusters = clusters;
}
+ // Counts of points in each cluster.
+ arma::Col<size_t> counts(actualClusters);
+
// Now, the initial assignments. First determine if they are necessary.
if (initialAssignmentGuess)
{
@@ -112,6 +122,19 @@ Cluster(const MatType& data,
Log::Fatal << "KMeans::Cluster(): initial cluster assignments (length "
<< assignments.n_elem << ") not the same size as the dataset (size "
<< data.n_cols << ")!" << std::endl;
+
+ // Calculate initial centroids.
+ counts.zeros(actualClusters);
+ centroids.zeros(data.n_rows, actualClusters);
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ centroids.col(assignments[i]) += data.col(i);
+ counts[assignments[i]]++;
+ }
+
+ for (size_t i = 0; i < actualClusters; ++i)
+ if (counts[i] != 0)
+ centroids.col(i) /= counts[i];
}
else if (initialCentroidGuess)
{
@@ -153,65 +176,36 @@ Cluster(const MatType& data,
{
// Use the partitioner to come up with the partition assignments.
partitioner.Cluster(data, actualClusters, assignments);
- }
-
- // Counts of points in each cluster.
- arma::Col<size_t> counts(actualClusters);
- counts.zeros();
- // Resize to correct size.
- centroids.set_size(data.n_rows, actualClusters);
+ // Calculate initial centroids.
+ counts.zeros(actualClusters);
+ centroids.zeros(data.n_rows, actualClusters);
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ centroids.col(assignments[i]) += data.col(i);
+ counts[assignments[i]]++;
+ }
- // Set counts correctly.
- for (size_t i = 0; i < assignments.n_elem; i++)
- counts[assignments[i]]++;
+ for (size_t i = 0; i < actualClusters; ++i)
+ if (counts[i] != 0)
+ centroids.col(i) /= counts[i];
+ }
size_t changedAssignments = 0;
size_t iteration = 0;
- do
- {
- // Update step.
- // Calculate centroids based on given assignments.
- centroids.zeros();
-
- for (size_t i = 0; i < data.n_cols; i++)
- centroids.col(assignments[i]) += data.col(i);
-
- for (size_t i = 0; i < actualClusters; i++)
- centroids.col(i) /= counts[i];
-
- // Assignment step.
- // Find the closest centroid to each point. We will keep track of how many
- // assignments change. When no assignments change, we are done.
- changedAssignments = 0;
- for (size_t i = 0; i < data.n_cols; i++)
- {
- // Find the closest centroid to this point.
- double minDistance = std::numeric_limits<double>::infinity();
- size_t closestCluster = actualClusters; // Invalid value.
-
- for (size_t j = 0; j < actualClusters; j++)
- {
- double distance = metric.Evaluate(data.col(i), centroids.col(j));
- if (distance < minDistance)
- {
- minDistance = distance;
- closestCluster = j;
- }
- }
+ LloydStepType<MetricType, MatType> lloydStep(data, metric);
+ arma::mat centroidsOther;
+ double cNorm;
- // Reassign this point to the closest cluster.
- if (assignments[i] != closestCluster)
- {
- // Update counts.
- counts[assignments[i]]--;
- counts[closestCluster]++;
- // Update assignment.
- assignments[i] = closestCluster;
- changedAssignments++;
- }
- }
+ do
+ {
+ // We have two centroid matrices. We don't want to copy anything, so,
+ // depending on the iteration number, we use a different centroid matrix...
+ if (iteration % 2 == 0)
+ lloydStep.Iterate(centroids, centroidsOther, counts);
+ else
+ lloydStep.Iterate(centroidsOther, centroids, counts);
// If we are not allowing empty clusters, then check that all of our
// clusters have points.
@@ -220,9 +214,23 @@ Cluster(const MatType& data,
changedAssignments += emptyClusterAction.EmptyCluster(data, i,
centroids, counts, assignments);
+ // Calculate cluster distortion for this iteration.
+ cNorm = 0.0;
+ for (size_t i = 0; i < centroids.n_cols; ++i)
+ {
+ const double dist = metric.Evaluate(centroids.col(i),
+ centroidsOther.col(i));
+ cNorm += std::pow(dist, 2.0);
+ }
+ cNorm = sqrt(cNorm);
+
iteration++;
- } while (changedAssignments > 0 && iteration != maxIterations);
+ } while (cNorm > 1e-5 && iteration != maxIterations);
+
+ // Unfortunate copy that is sometimes necessary.
+ if (iteration % 2 == 0)
+ centroids = centroidsOther;
if (iteration != maxIterations)
{
@@ -233,15 +241,28 @@ Cluster(const MatType& data,
{
Log::Debug << "KMeans::Cluster(): terminated after limit of " << iteration
<< " iterations." << std::endl;
+ }
+
+ // Calculate final assignments.
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ // Find the closest centroid to this point.
+ double minDistance = std::numeric_limits<double>::infinity();
+ size_t closestCluster = centroids.n_cols; // Invalid value.
- // Recalculate final clusters.
- centroids.zeros();
+ for (size_t j = 0; j < centroids.n_cols; j++)
+ {
+ const double distance = metric.Evaluate(data.col(i), centroids.col(j));
- for (size_t i = 0; i < data.n_cols; i++)
- centroids.col(assignments[i]) += data.col(i);
+ if (distance < minDistance)
+ {
+ minDistance = distance;
+ closestCluster = j;
+ }
+ }
- for (size_t i = 0; i < actualClusters; i++)
- centroids.col(i) /= counts[i];
+ Log::Assert(closestCluster != centroids.n_cols);
+ assignments[i] = closestCluster;
}
// If we have overclustered, we need to merge the nearest clusters.
@@ -372,17 +393,21 @@ Cluster(const MatType& data,
template<typename MetricType,
typename InitialPartitionPolicy,
- typename EmptyClusterPolicy>
+ typename EmptyClusterPolicy,
+ template<class, class> class LloydStepType,
+ typename MatType>
std::string KMeans<MetricType,
InitialPartitionPolicy,
- EmptyClusterPolicy>::ToString() const
+ EmptyClusterPolicy,
+ LloydStepType,
+ MatType>::ToString() const
{
std::ostringstream convert;
convert << "KMeans [" << this << "]" << std::endl;
- convert << " Overclustering Factor: " << overclusteringFactor <<std::endl;
- convert << " Max Iterations: " << maxIterations <<std::endl;
+ convert << " Overclustering Factor: " << overclusteringFactor << std::endl;
+ convert << " Max Iterations: " << maxIterations << std::endl;
convert << " Metric: " << std::endl;
- convert << mlpack::util::Indent(metric.ToString(),2);
+ convert << mlpack::util::Indent(metric.ToString(), 2);
convert << std::endl;
return convert.str();
}
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
index 0715d01..af14ca1 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster.hpp
@@ -40,7 +40,7 @@ class MaxVarianceNewCluster
template<typename MatType>
static size_t EmptyCluster(const MatType& data,
const size_t emptyCluster,
- const MatType& centroids,
+ arma::mat& centroids,
arma::Col<size_t>& clusterCounts,
arma::Col<size_t>& assignments);
};
diff --git a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
index c97ef71..91b40f5 100644
--- a/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
+++ b/src/mlpack/methods/kmeans/max_variance_new_cluster_impl.hpp
@@ -19,7 +19,7 @@ namespace kmeans {
template<typename MatType>
size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
const size_t emptyCluster,
- const MatType& centroids,
+ arma::mat& centroids,
arma::Col<size_t>& clusterCounts,
arma::Col<size_t>& assignments)
{
@@ -67,10 +67,15 @@ size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
}
// Take that point and add it to the empty cluster.
- clusterCounts[maxVarCluster]--;
+ centroids.col(maxVarCluster) *= (clusterCounts[maxVarCluster] /
+ --clusterCounts[maxVarCluster]);
+ centroids.col(maxVarCluster) -= (1.0 / clusterCounts[maxVarCluster]) *
+ data.col(furthestPoint);
clusterCounts[emptyCluster]++;
+ centroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
assignments[furthestPoint] = emptyCluster;
+
// Output some debugging information.
Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
emptyCluster << ".\n";
diff --git a/src/mlpack/methods/kmeans/naive_kmeans.hpp b/src/mlpack/methods/kmeans/naive_kmeans.hpp
new file mode 100644
index 0000000..d0b986a
--- /dev/null
+++ b/src/mlpack/methods/kmeans/naive_kmeans.hpp
@@ -0,0 +1,45 @@
+/**
+ * @file naive_kmeans.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of a naively-implemented step of the Lloyd algorithm for
+ * k-means clustering. This may still be the best choice for small datasets or
+ * datasets with very high dimensionality.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_NAIVE_KMEANS_HPP
+#define __MLPACK_METHODS_KMEANS_NAIVE_KMEANS_HPP
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+class NaiveKMeans
+{
+ public:
+ NaiveKMeans(const MatType& dataset, MetricType& metric);
+
+ /**
+ * Run a single iteration of the Lloyd algorithm, updating the given centroids
+ * into the newCentroids matrix.
+ *
+ * @param centroids Current cluster centroids.
+ * @param newCentroids New cluster centroids.
+ */
+ void Iterate(const arma::mat& centroids,
+ arma::mat& newCentroids,
+ arma::Col<size_t>& counts);
+
+ private:
+ //! The dataset.
+ const MatType& dataset;
+ //! The instantiated metric.
+ MetricType& metric;
+};
+
+} // namespace kmeans
+} // namespace mlpack
+
+// Include implementation.
+#include "naive_kmeans_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp b/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp
new file mode 100644
index 0000000..4e9163c
--- /dev/null
+++ b/src/mlpack/methods/kmeans/naive_kmeans_impl.hpp
@@ -0,0 +1,68 @@
+/**
+ * @file naive_kmeans_impl.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of a naively-implemented step of the Lloyd algorithm for
+ * k-means clustering. This may still be the best choice for small datasets or
+ * datasets with very high dimensionality.
+ */
+#ifndef __MLPACK_METHODS_KMEANS_NAIVE_KMEANS_IMPL_HPP
+#define __MLPACK_METHODS_KMEANS_NAIVE_KMEANS_IMPL_HPP
+
+// In case it hasn't been included yet.
+#include "naive_kmeans.hpp"
+
+namespace mlpack {
+namespace kmeans {
+
+template<typename MetricType, typename MatType>
+NaiveKMeans<MetricType, MatType>::NaiveKMeans(const MatType& dataset,
+ MetricType& metric) :
+ dataset(dataset),
+ metric(metric)
+{ /* Nothing to do. */ }
+
+// Run a single iteration.
+template<typename MetricType, typename MatType>
+void NaiveKMeans<MetricType, MatType>::Iterate(const arma::mat& centroids,
+ arma::mat& newCentroids,
+ arma::Col<size_t>& counts)
+{
+ newCentroids.zeros(centroids.n_rows, centroids.n_cols);
+ counts.zeros(centroids.n_cols);
+
+ // Find the closest centroid to each point and update the new centroids.
+ for (size_t i = 0; i < dataset.n_cols; i++)
+ {
+ // Find the closest centroid to this point.
+ double minDistance = std::numeric_limits<double>::infinity();
+ size_t closestCluster = centroids.n_cols; // Invalid value.
+
+ for (size_t j = 0; j < centroids.n_cols; j++)
+ {
+ const double distance = metric.Evaluate(dataset.col(i), centroids.col(j));
+
+ if (distance < minDistance)
+ {
+ minDistance = distance;
+ closestCluster = j;
+ }
+ }
+
+ Log::Assert(closestCluster != centroids.n_cols);
+
+ // We now have the minimum distance centroid index. Update that centroid.
+ newCentroids.col(closestCluster) += dataset.col(i);
+ counts(closestCluster)++;
+ }
+
+ // Now normalize the centroid.
+ for (size_t i = 0; i < centroids.n_cols; ++i)
+ if (counts(i) != 0)
+ newCentroids.col(i) /= counts(i);
+}
+
+} // namespace kmeans
+} // namespace mlpack
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits mailing list