[mlpack] 67/207: A first pass at DBSCAN. But it doesn't pass the tests.

Barak A. Pearlmutter barak+git at pearlmutter.net
Thu Mar 23 17:53:41 UTC 2017


This is an automated email from the git hooks/post-receive script.

bap pushed a commit to branch master
in repository mlpack.

commit 22db8b7c1d0256852a738985b85141b4e87af85c
Author: Ryan Curtin <ryan at ratml.org>
Date:   Wed Aug 3 17:31:44 2016 -0400

    A first pass at DBSCAN.  But it doesn't pass the tests.
---
 src/mlpack/methods/dbscan/CMakeLists.txt           |  16 +++
 src/mlpack/methods/dbscan/dbscan.hpp               |  71 ++++++++++
 src/mlpack/methods/dbscan/dbscan_impl.hpp          | 143 +++++++++++++++++++++
 .../methods/dbscan/random_point_selection.hpp      |  48 +++++++
 src/mlpack/tests/dbscan_test.cpp                   |  53 ++++++++
 5 files changed, 331 insertions(+)

diff --git a/src/mlpack/methods/dbscan/CMakeLists.txt b/src/mlpack/methods/dbscan/CMakeLists.txt
new file mode 100644
index 0000000..92ecf71
--- /dev/null
+++ b/src/mlpack/methods/dbscan/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Files that make up the DBSCAN method.
+# A file that is not listed here is not built as part of mlpack.
+set(SOURCES
+  dbscan.hpp
+  dbscan_impl.hpp
+  random_point_selection.hpp
+)
+
+# Prefix every entry with the directory this CMakeLists.txt lives in.
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+  list(APPEND DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+# Publish the prefixed paths by appending them to MLPACK_SRCS in the parent
+# scope, which holds the list of all mlpack sources.
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
diff --git a/src/mlpack/methods/dbscan/dbscan.hpp b/src/mlpack/methods/dbscan/dbscan.hpp
new file mode 100644
index 0000000..6d76128
--- /dev/null
+++ b/src/mlpack/methods/dbscan/dbscan.hpp
@@ -0,0 +1,71 @@
+/**
+ * @file dbscan.hpp
+ * @author Ryan Curtin
+ *
+ * An implementation of the DBSCAN clustering method, which is flexible enough
+ * to support other algorithms for finding nearest neighbors.
+ */
+#ifndef __MLPACK_METHODS_DBSCAN_DBSCAN_HPP
+#define __MLPACK_METHODS_DBSCAN_DBSCAN_HPP
+
+#include <mlpack/core.hpp>
+#include <mlpack/methods/range_search/range_search.hpp>
+#include "random_point_selection.hpp"
+#include <boost/dynamic_bitset.hpp>
+
+namespace mlpack {
+namespace dbscan {
+
+template<typename RangeSearchType = range::RangeSearch<>,
+         typename PointSelectionPolicy = RandomPointSelection>
+class DBSCAN
+{
+ public:
+  /**
+   * Construct the DBSCAN object with the given parameters.
+   *
+   * @param epsilon Upper bound of the range query run around each point.
+   * @param minPoints Neighbor count below which a seed point is noise.
+   */
+  DBSCAN(const double epsilon,
+         const size_t minPoints);
+
+  template<typename MatType>
+  size_t Cluster(const MatType& data,
+                 arma::mat& centroids); // Centroids only; labels discarded.
+
+  template<typename MatType>
+  size_t Cluster(const MatType& data,
+                 arma::Row<size_t>& assignments); // Labels only.
+
+  //! Points left unassigned (noise) have assignments[i] == SIZE_MAX; they are
+  //! skipped when the centroids are computed.
+  template<typename MatType>
+  size_t Cluster(const MatType& data,
+                 arma::Row<size_t>& assignments,
+                 arma::mat& centroids);
+
+ private:
+  RangeSearchType rangeSearch; // Trained on the data, then queried once.
+  PointSelectionPolicy pointSelector; // Picks the next point to process.
+  double epsilon; // Upper bound of the range passed to the range search.
+  size_t minPoints; // Neighbor-count threshold used by ProcessPoint().
+
+  template<typename MatType>
+  void ProcessPoint(const MatType& data,
+                    boost::dynamic_bitset<>& unvisited,
+                    const size_t index,
+                    arma::Row<size_t>& assignments,
+                    const size_t currentCluster,
+                    const std::vector<std::vector<size_t>>& neighbors,
+                    const std::vector<std::vector<double>>& distances,
+                    const bool topLevel = true);
+};
+
+} // namespace dbscan
+} // namespace mlpack
+
+// Include implementation.
+#include "dbscan_impl.hpp"
+
+#endif
diff --git a/src/mlpack/methods/dbscan/dbscan_impl.hpp b/src/mlpack/methods/dbscan/dbscan_impl.hpp
new file mode 100644
index 0000000..32e6932
--- /dev/null
+++ b/src/mlpack/methods/dbscan/dbscan_impl.hpp
@@ -0,0 +1,143 @@
+/**
+ * @file dbscan_impl.hpp
+ * @author Ryan Curtin
+ *
+ * Implementation of DBSCAN.
+ */
+#ifndef __MLPACK_METHODS_DBSCAN_DBSCAN_IMPL_HPP
+#define __MLPACK_METHODS_DBSCAN_DBSCAN_IMPL_HPP
+
+#include "dbscan.hpp"
+
+namespace mlpack {
+namespace dbscan {
+
+template<typename RangeSearchType, typename PointSelectionPolicy>
+DBSCAN<RangeSearchType, PointSelectionPolicy>::DBSCAN(const double epsilon,
+                                                      const size_t minPoints) :
+    epsilon(epsilon),
+    minPoints(minPoints)
+{
+  // Only the two parameters are stored; the other members are not set here.
+}
+
+template<typename RangeSearchType, typename PointSelectionPolicy>
+template<typename MatType>
+size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
+    const MatType& data,
+    arma::mat& centroids)
+{
+  // The caller does not want per-point labels, but the centroids cannot be
+  // computed without them, so collect them in a scratch row.
+  arma::Row<size_t> scratch;
+  scratch.set_size(data.n_cols);
+  scratch.fill(SIZE_MAX);
+  return Cluster(data, scratch, centroids);
+}
+
+//! Cluster `data`, then store the mean of each cluster as a centroid column.
+template<typename RangeSearchType, typename PointSelectionPolicy>
+template<typename MatType>
+size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
+    const MatType& data,
+    arma::Row<size_t>& assignments,
+    arma::mat& centroids)
+{
+  // Label every point first; unassigned points hold the SIZE_MAX sentinel.
+  const size_t numClusters = Cluster(data, assignments);
+
+  // Accumulate per-cluster coordinate sums and member counts.
+  centroids.zeros(data.n_rows, numClusters);
+  arma::Row<size_t> counts;
+  counts.zeros(numClusters);
+  for (size_t i = 0; i < data.n_cols; ++i)
+  {
+    if (assignments[i] == SIZE_MAX)
+      continue; // Unassigned points contribute to no centroid.
+    centroids.col(assignments[i]) += data.col(i);
+    ++counts[assignments[i]];
+  }
+
+  // Turn sums into means.  A cluster index that received no points keeps a
+  // zero centroid instead of being divided by zero.
+  for (size_t i = 0; i < numClusters; ++i)
+    if (counts[i] > 0)
+      centroids.col(i) /= counts[i];
+
+  return numClusters;
+}
+
+template<typename RangeSearchType, typename PointSelectionPolicy>
+template<typename MatType>
+size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
+    const MatType& data,
+    arma::Row<size_t>& assignments)
+{
+  assignments.set_size(data.n_cols);
+  assignments.fill(SIZE_MAX); // SIZE_MAX marks noise / unassigned points.
+  std::vector<std::vector<size_t>> neighbors;
+  std::vector<std::vector<double>> distances;
+  rangeSearch.Train(data);
+  rangeSearch.Search(data, math::Range(0.0, epsilon), neighbors, distances);
+
+  boost::dynamic_bitset<> unvisited(data.n_cols);
+  unvisited.set(); // All unvisited; the (n, 1) ctor would set only bit 0.
+  size_t currentCluster = 0;
+  while (unvisited.any())
+  {
+    size_t seed = pointSelector.Select(unvisited, data);
+    if (seed >= unvisited.size() || !unvisited[seed])
+      seed = unvisited.find_first(); // Bad pick: avoid looping forever.
+    ProcessPoint(data, unvisited, seed, assignments, currentCluster,
+        neighbors, distances);
+    // Only a seed with at least minPoints neighbors founds a new cluster.
+    if (neighbors[seed].size() >= minPoints)
+      assignments[seed] = currentCluster++;
+  }
+  return currentCluster;
+}
+
+template<typename RangeSearchType, typename PointSelectionPolicy>
+template<typename MatType>
+void DBSCAN<RangeSearchType, PointSelectionPolicy>::ProcessPoint(
+    const MatType& data,
+    boost::dynamic_bitset<>& unvisited,
+    const size_t index,
+    arma::Row<size_t>& assignments,
+    const size_t currentCluster,
+    const std::vector<std::vector<size_t>>& neighbors,
+    const std::vector<std::vector<double>>& distances,
+    const bool topLevel)
+{
+  // We've now visited this point.
+  unvisited[index] = false;
+
+  // A top-level point with too few neighbors is noise: it stays visited and
+  // its assignment is left untouched.
+  if (topLevel && (neighbors[index].size() < minPoints))
+    return;
+
+  // Label the point itself.  It was marked visited above, so the loop below
+  // skips it even if it appears in its own neighbor list; without this line
+  // the seed of a cluster would never receive an assignment.
+  assignments[index] = currentCluster;
+
+  // NOTE(review): points reached by recursion are expanded even when they have
+  // fewer than minPoints neighbors -- confirm this is the intended variant.
+  for (size_t j = 0; j < neighbors[index].size(); ++j)
+  {
+    const size_t neighbor = neighbors[index][j];
+    if (!unvisited[neighbor])
+      continue; // Already visited; leave its assignment alone.
+
+    assignments[neighbor] = currentCluster;
+    unvisited[neighbor] = false;
+    ProcessPoint(data, unvisited, neighbor, assignments, currentCluster,
+        neighbors, distances, false);
+  }
+}
+
+} // namespace dbscan
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/methods/dbscan/random_point_selection.hpp b/src/mlpack/methods/dbscan/random_point_selection.hpp
new file mode 100644
index 0000000..0ee2874
--- /dev/null
+++ b/src/mlpack/methods/dbscan/random_point_selection.hpp
@@ -0,0 +1,48 @@
+/**
+ * @file random_point_selection.hpp
+ * @author Ryan Curtin
+ *
+ * Randomly select the next point for DBSCAN.
+ */
+#ifndef MLPACK_METHODS_DBSCAN_RANDOM_POINT_SELECTION_HPP
+#define MLPACK_METHODS_DBSCAN_RANDOM_POINT_SELECTION_HPP
+
+#include <mlpack/prereqs.hpp>
+#include <boost/dynamic_bitset.hpp>
+
+namespace mlpack {
+namespace dbscan {
+
+class RandomPointSelection
+{
+ public:
+  /**
+   * Select the next point to use, randomly, among the unvisited points.
+   *
+   * @param unvisited Bitset in which a set bit marks an unvisited point.
+   * @return Position of the chosen set bit, or 0 if no such bit is found.
+   */
+  template<typename MatType>
+  static size_t Select(const boost::dynamic_bitset<>& unvisited,
+                       const MatType& /* data */)
+  {
+    const size_t max = unvisited.count();
+    const size_t index = math::RandInt(max);
+
+    // Walk the bits in order.  found counts the set bits passed so far, so
+    // the index'th (0-based) unvisited point is where found equals index.
+    size_t found = 0;
+    for (size_t i = 0; i < unvisited.size(); ++i)
+    {
+      if (unvisited[i] && (found++ == index))
+        return i;
+    }
+
+    return 0; // Fewer than index + 1 bits were set (e.g. an empty bitset).
+  }
+};
+
+} // namespace dbscan
+} // namespace mlpack
+
+#endif
diff --git a/src/mlpack/tests/dbscan_test.cpp b/src/mlpack/tests/dbscan_test.cpp
new file mode 100644
index 0000000..7c4ab9f
--- /dev/null
+++ b/src/mlpack/tests/dbscan_test.cpp
@@ -0,0 +1,53 @@
+/**
+ * @file dbscan_test.cpp
+ * @author Ryan Curtin
+ *
+ * Test the DBSCAN implementation.
+ */
+#include <mlpack/core.hpp>
+#include <mlpack/methods/dbscan/dbscan.hpp>
+
+#include <boost/test/unit_test.hpp>
+#include "test_tools.hpp"
+
+using namespace mlpack;
+using namespace mlpack::dbscan;
+
+BOOST_AUTO_TEST_SUITE(DBSCANTest);
+
+BOOST_AUTO_TEST_CASE(OneClusterTest)
+{
+  // With a generous epsilon, random points drawn from the unit box are all
+  // expected to land in a single cluster, i.e. every label should be 0.
+  arma::Row<size_t> assignments;
+  arma::mat points(10, 1000, arma::fill::randu);
+
+  DBSCAN<> model(2.0, 2);
+  const size_t clusters = model.Cluster(points, assignments);
+
+  BOOST_REQUIRE_EQUAL(clusters, 1);
+  BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+
+  for (size_t i = 0; i < assignments.n_elem; ++i)
+    BOOST_REQUIRE_EQUAL(assignments[i], 0);
+}
+
+/**
+ * An epsilon this tiny should leave every point short of minPoints neighbors:
+ * expect zero clusters and the noise label (SIZE_MAX) on every point.
+ */
+BOOST_AUTO_TEST_CASE(TinyEpsilonTest)
+{
+  arma::Row<size_t> assignments;
+  arma::mat points(10, 1000, arma::fill::randu);
+
+  DBSCAN<> model(1e-50, 2);
+  const size_t clusters = model.Cluster(points, assignments);
+
+  BOOST_REQUIRE_EQUAL(clusters, 0);
+  BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+  for (size_t i = 0; i < assignments.n_elem; ++i)
+    BOOST_REQUIRE_EQUAL(assignments[i], SIZE_MAX);
+}
+
+BOOST_AUTO_TEST_SUITE_END();

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git



More information about the debian-science-commits mailing list