[mlpack] 01/10: Significant DBSCAN refactoring and improvements.

Wed Jul 26 21:50:55 UTC 2017

This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 352f6fe2f2089265fc45b1639d24cbd43fb3a49e
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Jun 14 17:02:44 2017 -0400

    Significant DBSCAN refactoring and improvements.
    
     - Use PARAM_MATRIX() instead of strings.
     - Add a single-point mode that handles RAM better.
     - Use UnionFind for much more efficient cluster finding.
     - Make --single_mode use the single point mode.
---
 src/mlpack/methods/dbscan/dbscan.hpp      |  56 ++++++------
 src/mlpack/methods/dbscan/dbscan_impl.hpp | 138 +++++++++++++++++-------------
 src/mlpack/methods/dbscan/dbscan_main.cpp |  12 +--
 src/mlpack/tests/dbscan_test.cpp          | 125 +++++++++++++++++++++++++++
 4 files changed, 243 insertions(+), 88 deletions(-)

diff --git a/src/mlpack/methods/dbscan/dbscan.hpp b/src/mlpack/methods/dbscan/dbscan.hpp
index e35d7f9..c48d1d4 100644
--- a/src/mlpack/methods/dbscan/dbscan.hpp
+++ b/src/mlpack/methods/dbscan/dbscan.hpp
@@ -15,6 +15,7 @@
 
 #include <mlpack/core.hpp>
 #include <mlpack/methods/range_search/range_search.hpp>
+#include <mlpack/methods/emst/union_find.hpp>
 #include "random_point_selection.hpp"
 #include <boost/dynamic_bitset.hpp>
 
@@ -52,15 +53,21 @@ class DBSCAN
 {
  public:
   /**
-   * Construct the DBSCAN object with the given parameters.
+   * Construct the DBSCAN object with the given parameters.  The batchMode
+   * parameter should be set to false in the case where RAM issues will be
+   * encountered (i.e. if the dataset is very large or if epsilon is large).
+   * When batchMode is false, each point will be searched iteratively, which
+   * could be slower but will use less memory.
    *
    * @param epsilon Size of range query.
    * @param minPoints Minimum number of points for each cluster.
+   * @param batchMode If true, all points are searched in batch.
    * @param rangeSearch Optional instantiated RangeSearch object.
    * @param pointSelector OptionL instantiated PointSelectionPolicy object.
    */
   DBSCAN(const double epsilon,
          const size_t minPoints,
+         const bool batchMode = true,
          RangeSearchType rangeSearch = RangeSearchType(),
          PointSelectionPolicy pointSelector = PointSelectionPolicy());
 
@@ -112,6 +119,9 @@ class DBSCAN
   //! itself) for the point to be a core-point.
   size_t minPoints;
 
+  //! Whether or not to perform the search in batch mode.  If false, single
+  bool batchMode;
+
   //! Instantiated range search policy.
   RangeSearchType rangeSearch;
 
@@ -119,33 +129,31 @@ class DBSCAN
   PointSelectionPolicy pointSelector;
 
   /**
-   * This function processes the point at index. It  marks the point as visited,
-   * checks if the given point is core or non-core.  If it is a core point, it
-   * expands the cluster, otherwise it returns.
+   * Performs DBSCAN clustering on the data, returning the number of clusters and
+   * also the list of cluster assignments.  This searches each point iteratively,
+   * and can save on RAM usage.  It may be slower than the batch search with a
+   * dual-tree algorithm.
    *
-   * @tparam MatType Type of matrix (arma::mat or arma::sp_mat).
    * @param data Dataset to cluster.
-   * @param unvisited Remembers if a point has been visited.
-   * @param index Index of point to be visited now.
-   * @param assignments Vector to store cluster assignments.
-   * @param currentCluster Index of cluster which will be  assigned to points in
-   *     current cluster.
-   * @param neighbors Matrix containing list of neighbors for each point which
-   *     fall in its epsilon-neighborhood.
-   * @param distances Matrix containing list of distances for each point which
-   *     fall in its epsilon-neighborhood.
-   * @param topLevel If true, then current point is the first point in the
-   *     current cluster, helps in detecting noise.
+   * @param assignments Assignments for each point.
+   * @param uf UnionFind structure that will be modified.
+   */
+  template<typename MatType>
+  void PointwiseCluster(const MatType& data,
+                        emst::UnionFind& uf);
+
+  /**
+   * Performs DBSCAN clustering on the data, returning number of clusters
+   * and also the list of cluster assignments.  This can perform search in batch,
+   * so it is well suited for dual-tree or naive search.
+   *
+   * @param data Dataset to cluster.
+   * @param assignments Assignments for each point.
+   * @param uf UnionFind structure that will be modified.
    */
   template<typename MatType>
-  size_t ProcessPoint(const MatType& data,
-                      boost::dynamic_bitset<>& unvisited,
-                      const size_t index,
-                      arma::Row<size_t>& assignments,
-                      const size_t currentCluster,
-                      const std::vector<std::vector<size_t>>& neighbors,
-                      const std::vector<std::vector<double>>& distances,
-                      const bool topLevel = true);
+  void BatchCluster(const MatType& data,
+                    emst::UnionFind& uf);
 };
 
 } // namespace dbscan
diff --git a/src/mlpack/methods/dbscan/dbscan_impl.hpp b/src/mlpack/methods/dbscan/dbscan_impl.hpp
index 91e0b82..f485871 100644
--- a/src/mlpack/methods/dbscan/dbscan_impl.hpp
+++ b/src/mlpack/methods/dbscan/dbscan_impl.hpp
@@ -24,10 +24,12 @@ template<typename RangeSearchType, typename PointSelectionPolicy>
 DBSCAN<RangeSearchType, PointSelectionPolicy>::DBSCAN(
     const double epsilon,
     const size_t minPoints,
+    const bool batchMode,
     RangeSearchType rangeSearch,
     PointSelectionPolicy pointSelector) :
     epsilon(epsilon),
     minPoints(minPoints),
+    batchMode(batchMode),
     rangeSearch(rangeSearch),
     pointSelector(pointSelector)
 {
@@ -89,8 +91,8 @@ size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
 }
 
 /**
- * Performs DBSCAN clustering on the data, returning number of clusters 
- * and also the list of cluster assignments.
+ * Performs DBSCAN clustering on the data, returning the number of clusters and
+ * also the list of cluster assignments.
  */
 template<typename RangeSearchType, typename PointSelectionPolicy>
 template<typename MatType>
@@ -98,83 +100,101 @@ size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
     const MatType& data,
     arma::Row<size_t>& assignments)
 {
-  assignments.set_size(data.n_cols);
-  assignments.fill(SIZE_MAX);
+  // Initialize the UnionFind object.
+  emst::UnionFind uf(data.n_cols);
+  rangeSearch.Train(data);
 
-  size_t currentCluster = 0;
+  if (batchMode)
+    BatchCluster(data, uf);
+  else
+    PointwiseCluster(data, uf);
 
-  // For each point, find the points in epsilon-nighborhood and their distances.
-  std::vector<std::vector<size_t>> neighbors;
-  std::vector<std::vector<double>> distances;
-  Log::Debug << "Performing range search." << std::endl;
-  rangeSearch.Train(data);
-  rangeSearch.Search(data, math::Range(0.0, epsilon), neighbors, distances);
-  Log::Debug << "Range search complete." << std::endl;
+  // Now set assignments.
+  assignments.set_size(data.n_cols);
+  for (size_t i = 0; i < data.n_cols; ++i)
+    assignments[i] = uf.Find(i);
+
+  // Get a count of all clusters.
+  const size_t numClusters = arma::max(assignments) + 1;
+  arma::Col<size_t> counts(numClusters, arma::fill::zeros);
+  for (size_t i = 0; i < assignments.n_elem; ++i)
+    counts[assignments[i]]++;
 
-  // Initialize to all true; false means it's been visited.
-  boost::dynamic_bitset<> unvisited(data.n_cols);
-  unvisited.set();
-  while (unvisited.any())
+  // Now assign clusters to new indices.
+  size_t currentCluster = 0;
+  arma::Col<size_t> newAssignments(numClusters);
+  for (size_t i = 0; i < counts.n_elem; ++i)
   {
-    // Select the next unvisited point.
-    const size_t nextIndex = pointSelector.Select(unvisited, data);
-    Log::Debug << "Inspect point " << nextIndex << "; " << unvisited.count()
-        << " unvisited points remain." << std::endl;
-
-    // currentCluster will only be incremented if a cluster was created.
-    currentCluster = ProcessPoint(data, unvisited, nextIndex, assignments,
-        currentCluster, neighbors, distances);
+    if (counts[i] >= minPoints)
+      newAssignments[i] = currentCluster++;
+    else
+      newAssignments[i] = SIZE_MAX;
   }
 
+  // Now reassign.
+  for (size_t i = 0; i < assignments.n_elem; ++i)
+    assignments[i] = newAssignments[assignments[i]];
+
+  Log::Info << currentCluster << " clusters found." << std::endl;
+
   return currentCluster;
 }
 
 /**
- * This function processes the point at index. It marks the point as visited,
- * checks if the given point is core or non-core. If it is a core point, it
- * expands the cluster, otherwise it returns.
+ * Performs DBSCAN clustering on the data, returning the number of clusters and
+ * also the list of cluster assignments.  This searches each point iteratively,
+ * and can save on RAM usage.  It may be slower than the batch search with a
+ * dual-tree algorithm.
  */
 template<typename RangeSearchType, typename PointSelectionPolicy>
 template<typename MatType>
-size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::ProcessPoint(
+void DBSCAN<RangeSearchType, PointSelectionPolicy>::PointwiseCluster(
     const MatType& data,
-    boost::dynamic_bitset<>& unvisited,
-    const size_t index,
-    arma::Row<size_t>& assignments,
-    const size_t currentCluster,
-    const std::vector<std::vector<size_t>>& neighbors,
-    const std::vector<std::vector<double>>& distances,
-    const bool topLevel)
+    emst::UnionFind& uf)
 {
-  // We've now visited this point.
-  unvisited[index] = false;
+  std::vector<std::vector<size_t>> neighbors;
+  std::vector<std::vector<double>> distances;
 
-  if ((neighbors[index].size() < minPoints) && topLevel)
-  {
-    // Mark the point as noise (leave assignments[index] unset) and return.
-    unvisited[index] = false;
-    return currentCluster;
-  }
-  else
+  for (size_t i = 0; i < data.n_cols; ++i)
   {
-    assignments[index] = currentCluster;
+    if (i % 10000 == 0 && i > 0)
+      Log::Info << "DBSCAN clustering on point " << i << "..." << std::endl;
 
-    // New cluster.
-    for (size_t j = 0; j < neighbors[index].size(); ++j)
-    {
-      // Add each point to the cluster and mark it as visited, but only if it
-      // has not been visited yet.
-      if (!unvisited[neighbors[index][j]])
-        continue;
+    // Do the range search for only this point.
+    rangeSearch.Search(data.col(i), math::Range(0.0, epsilon), neighbors,
+        distances);
 
-      assignments[neighbors[index][j]] = currentCluster;
-      unvisited[neighbors[index][j]] = false;
+    // Union to all neighbors.
+    for (size_t j = 0; j < neighbors[0].size(); ++j)
+      uf.Union(i, neighbors[0][j]);
+  }
+}
 
-      // Recurse into this point.
-      ProcessPoint(data, unvisited, neighbors[index][j], assignments,
-          currentCluster, neighbors, distances, false);
-    }
-    return currentCluster + 1;
+/**
+ * Performs DBSCAN clustering on the data, returning number of clusters
+ * and also the list of cluster assignments.  This can perform search in batch,
+ * naive search).
+ */
+template<typename RangeSearchType, typename PointSelectionPolicy>
+template<typename MatType>
+void DBSCAN<RangeSearchType, PointSelectionPolicy>::BatchCluster(
+    const MatType& data,
+    emst::UnionFind& uf)
+{
+  // For each point, find the points in epsilon-nighborhood and their distances.
+  std::vector<std::vector<size_t>> neighbors;
+  std::vector<std::vector<double>> distances;
+  Log::Info << "Performing range search." << std::endl;
+  rangeSearch.Train(data);
+  rangeSearch.Search(data, math::Range(0.0, epsilon), neighbors, distances);
+  Log::Info << "Range search complete." << std::endl;
+
+  // Now loop over all points.
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    // Union to all neighbors.
+    for (size_t j = 0; j < neighbors[i].size(); ++j)
+      uf.Union(i, neighbors[i][j]);
   }
 }
 
diff --git a/src/mlpack/methods/dbscan/dbscan_main.cpp b/src/mlpack/methods/dbscan/dbscan_main.cpp
index 1a02264..917df10 100644
--- a/src/mlpack/methods/dbscan/dbscan_main.cpp
+++ b/src/mlpack/methods/dbscan/dbscan_main.cpp
@@ -43,15 +43,15 @@ PROGRAM_INFO("DBSCAN clustering",
     " tree used for range search; this can take a variety of values: 'kd', 'r',"
     " 'r-star', 'x', 'hilbert-r', 'r-plus', 'r-plus-plus', 'cover', 'ball'. "
     "The --single_mode option will force single-tree search (as opposed to the "
-    "default dual-tree search), and --naive will force brute-force range "
-    "search."
+    "default dual-tree search).  --single_mode can be useful when the RAM usage"
+    " of batch search is too high.  The --naive option will force brute-force "
+    "range search."
     "\n\n"
     "An example usage to run DBSCAN on the dataset in input.csv with a radius "
     "of 0.5 and a minimum cluster size of 5 is given below:"
     "\n\n"
     "  $ mlpack_dbscan -i input.csv -e 0.5 -m 5");
 
-
 PARAM_STRING_IN_REQ("input_file", "Input dataset to cluster.", "i");
 PARAM_STRING_OUT("assignments_file", "Output file for assignments of each "
     "point.", "a");
@@ -82,7 +82,8 @@ void RunDBSCAN(RangeSearchType rs = RangeSearchType())
   const double epsilon = CLI::GetParam<double>("epsilon");
   const size_t minSize = (size_t) CLI::GetParam<int>("min_size");
 
-  DBSCAN<RangeSearchType> d(epsilon, minSize, rs);
+  DBSCAN<RangeSearchType> d(epsilon, minSize, !CLI::HasParam("single_mode"),
+      rs);
 
   // If possible, avoid the overhead of calculating centroids.
   arma::Row<size_t> assignments;
@@ -100,7 +101,8 @@ void RunDBSCAN(RangeSearchType rs = RangeSearchType())
   }
 
   if (CLI::HasParam("assignments_file"))
-    data::Save(CLI::GetParam<string>("assignments_file"), assignments, false);
+    data::Save(CLI::GetParam<string>("assignments_file"), assignments, false,
+        false /* no transpose */);
 }
 
 int main(int argc, char** argv)
diff --git a/src/mlpack/tests/dbscan_test.cpp b/src/mlpack/tests/dbscan_test.cpp
index 3e8b5ff..cd79809 100644
--- a/src/mlpack/tests/dbscan_test.cpp
+++ b/src/mlpack/tests/dbscan_test.cpp
@@ -146,4 +146,129 @@ BOOST_AUTO_TEST_CASE(GaussiansTest)
   }
 }
 
+BOOST_AUTO_TEST_CASE(OneClusterSingleModeTest)
+{
+  // Make sure that if we have points in the unit box, and if we set epsilon
+  // large enough, all points end up as in one cluster.
+  arma::mat points(10, 200, arma::fill::randu);
+
+  DBSCAN<> d(2.0, 2, false);
+
+  arma::Row<size_t> assignments;
+  const size_t clusters = d.Cluster(points, assignments);
+
+  BOOST_REQUIRE_EQUAL(clusters, 1);
+  BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+  for (size_t i = 0; i < assignments.n_elem; ++i)
+    BOOST_REQUIRE_EQUAL(assignments[i], 0);
+}
+
+/**
+ * When epsilon is small enough, every point returned should be noise.
+ */
+BOOST_AUTO_TEST_CASE(TinyEpsilonSingleModeTest)
+{
+  arma::mat points(10, 200, arma::fill::randu);
+
+  DBSCAN<> d(1e-50, 2, false);
+
+  arma::Row<size_t> assignments;
+  const size_t clusters = d.Cluster(points, assignments);
+
+  BOOST_REQUIRE_EQUAL(clusters, 0);
+  BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+  for (size_t i = 0; i < assignments.n_elem; ++i)
+    BOOST_REQUIRE_EQUAL(assignments[i], SIZE_MAX);
+}
+
+/**
+ * Check that outliers are properly labeled as noise.
+ */
+BOOST_AUTO_TEST_CASE(OutlierSingleModeTest)
+{
+  arma::mat points(2, 200, arma::fill::randu);
+
+  // Add 3 outliers.
+  points.col(15) = arma::vec("10.3 1.6");
+  points.col(45) = arma::vec("-100 0.0");
+  points.col(101) = arma::vec("1.5 1.5");
+
+  DBSCAN<> d(0.1, 3, false);
+
+  arma::Row<size_t> assignments;
+  const size_t clusters = d.Cluster(points, assignments);
+
+  BOOST_REQUIRE_GT(clusters, 0);
+  BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+  BOOST_REQUIRE_EQUAL(assignments[15], SIZE_MAX);
+  BOOST_REQUIRE_EQUAL(assignments[45], SIZE_MAX);
+  BOOST_REQUIRE_EQUAL(assignments[101], SIZE_MAX);
+}
+
+/**
+ * Check that the Gaussian clusters are correctly found.
+ */
+BOOST_AUTO_TEST_CASE(GaussiansSingleModeTest)
+{
+  arma::mat points(3, 300);
+
+  GaussianDistribution g1(3), g2(3), g3(3);
+  g1.Mean() = arma::vec("0.0 0.0 0.0");
+  g2.Mean() = arma::vec("6.0 6.0 8.0");
+  g3.Mean() = arma::vec("-6.0 1.0 -7.0");
+  for (size_t i = 0; i < 100; ++i)
+    points.col(i) = g1.Random();
+  for (size_t i = 100; i < 200; ++i)
+    points.col(i) = g2.Random();
+  for (size_t i = 200; i < 300; ++i)
+    points.col(i) = g3.Random();
+
+  DBSCAN<> d(2.0, 3);
+
+  arma::Row<size_t> assignments;
+  arma::mat centroids;
+  const size_t clusters = d.Cluster(points, assignments, centroids);
+  BOOST_REQUIRE_EQUAL(clusters, 3);
+
+  // Our centroids should be close to one of our Gaussians.
+  arma::Row<size_t> matches(3);
+  matches.fill(3);
+  for (size_t j = 0; j < 3; ++j)
+  {
+    if (arma::norm(g1.Mean() - centroids.col(j)) < 3.0)
+      matches(0) = j;
+    else if (arma::norm(g2.Mean() - centroids.col(j)) < 3.0)
+      matches(1) = j;
+    else if (arma::norm(g3.Mean() - centroids.col(j)) < 3.0)
+      matches(2) = j;
+  }
+
+  BOOST_REQUIRE_NE(matches(0), matches(1));
+  BOOST_REQUIRE_NE(matches(1), matches(2));
+  BOOST_REQUIRE_NE(matches(2), matches(0));
+
+  BOOST_REQUIRE_NE(matches(0), 3);
+  BOOST_REQUIRE_NE(matches(1), 3);
+  BOOST_REQUIRE_NE(matches(2), 3);
+
+  for (size_t i = 0; i < 100; ++i)
+  {
+    // Each point should either be noise or in cluster matches(0).
+    BOOST_REQUIRE_NE(assignments(i), matches(1));
+    BOOST_REQUIRE_NE(assignments(i), matches(2));
+  }
+
+  for (size_t i = 100; i < 200; ++i)
+  {
+    BOOST_REQUIRE_NE(assignments(i), matches(0));
+    BOOST_REQUIRE_NE(assignments(i), matches(2));
+  }
+
+  for (size_t i = 200; i < 300; ++i)
+  {
+    BOOST_REQUIRE_NE(assignments(i), matches(0));
+    BOOST_REQUIRE_NE(assignments(i), matches(1));
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END();

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git