[mlpack] 01/10: Significant DBSCAN refactoring and improvements.
Barak A. Pearlmutter
barak+git at pearlmutter.net
Wed Jul 26 21:50:55 UTC 2017
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch master
in repository mlpack.
commit 352f6fe2f2089265fc45b1639d24cbd43fb3a49e
Author: Ryan Curtin <ryan at ratml.org>
Date: Wed Jun 14 17:02:44 2017 -0400
Significant DBSCAN refactoring and improvements.
- Use PARAM_MATRIX() instead of strings.
- Add a single-point mode that handles RAM better.
- Use UnionFind for much more efficient cluster finding.
- Make --single_mode use the single point mode.
---
src/mlpack/methods/dbscan/dbscan.hpp | 56 ++++++------
src/mlpack/methods/dbscan/dbscan_impl.hpp | 138 +++++++++++++++++-------------
src/mlpack/methods/dbscan/dbscan_main.cpp | 12 +--
src/mlpack/tests/dbscan_test.cpp | 125 +++++++++++++++++++++++++++
4 files changed, 243 insertions(+), 88 deletions(-)
diff --git a/src/mlpack/methods/dbscan/dbscan.hpp b/src/mlpack/methods/dbscan/dbscan.hpp
index e35d7f9..c48d1d4 100644
--- a/src/mlpack/methods/dbscan/dbscan.hpp
+++ b/src/mlpack/methods/dbscan/dbscan.hpp
@@ -15,6 +15,7 @@
#include <mlpack/core.hpp>
#include <mlpack/methods/range_search/range_search.hpp>
+#include <mlpack/methods/emst/union_find.hpp>
#include "random_point_selection.hpp"
#include <boost/dynamic_bitset.hpp>
@@ -52,15 +53,21 @@ class DBSCAN
{
public:
/**
- * Construct the DBSCAN object with the given parameters.
+ * Construct the DBSCAN object with the given parameters. The batchMode
+ * parameter should be set to false in the case where RAM issues will be
+ * encountered (i.e. if the dataset is very large or if epsilon is large).
+ * When batchMode is false, each point will be searched iteratively, which
+ * could be slower but will use less memory.
*
* @param epsilon Size of range query.
* @param minPoints Minimum number of points for each cluster.
+ * @param batchMode If true, all points are searched in batch.
* @param rangeSearch Optional instantiated RangeSearch object.
* @param pointSelector OptionL instantiated PointSelectionPolicy object.
*/
DBSCAN(const double epsilon,
const size_t minPoints,
+ const bool batchMode = true,
RangeSearchType rangeSearch = RangeSearchType(),
PointSelectionPolicy pointSelector = PointSelectionPolicy());
@@ -112,6 +119,9 @@ class DBSCAN
//! itself) for the point to be a core-point.
size_t minPoints;
+ //! Whether or not to perform the search in batch mode. If false, single
+ bool batchMode;
+
//! Instantiated range search policy.
RangeSearchType rangeSearch;
@@ -119,33 +129,31 @@ class DBSCAN
PointSelectionPolicy pointSelector;
/**
- * This function processes the point at index. It marks the point as visited,
- * checks if the given point is core or non-core. If it is a core point, it
- * expands the cluster, otherwise it returns.
+ * Performs DBSCAN clustering on the data, returning the number of clusters and
+ * also the list of cluster assignments. This searches each point iteratively,
+ * and can save on RAM usage. It may be slower than the batch search with a
+ * dual-tree algorithm.
*
- * @tparam MatType Type of matrix (arma::mat or arma::sp_mat).
* @param data Dataset to cluster.
- * @param unvisited Remembers if a point has been visited.
- * @param index Index of point to be visited now.
- * @param assignments Vector to store cluster assignments.
- * @param currentCluster Index of cluster which will be assigned to points in
- * current cluster.
- * @param neighbors Matrix containing list of neighbors for each point which
- * fall in its epsilon-neighborhood.
- * @param distances Matrix containing list of distances for each point which
- * fall in its epsilon-neighborhood.
- * @param topLevel If true, then current point is the first point in the
- * current cluster, helps in detecting noise.
+ * @param assignments Assignments for each point.
+ * @param uf UnionFind structure that will be modified.
+ */
+ template<typename MatType>
+ void PointwiseCluster(const MatType& data,
+ emst::UnionFind& uf);
+
+ /**
+ * Performs DBSCAN clustering on the data, returning number of clusters
+ * and also the list of cluster assignments. This can perform search in batch,
+ * so it is well suited for dual-tree or naive search.
+ *
+ * @param data Dataset to cluster.
+ * @param assignments Assignments for each point.
+ * @param uf UnionFind structure that will be modified.
*/
template<typename MatType>
- size_t ProcessPoint(const MatType& data,
- boost::dynamic_bitset<>& unvisited,
- const size_t index,
- arma::Row<size_t>& assignments,
- const size_t currentCluster,
- const std::vector<std::vector<size_t>>& neighbors,
- const std::vector<std::vector<double>>& distances,
- const bool topLevel = true);
+ void BatchCluster(const MatType& data,
+ emst::UnionFind& uf);
};
} // namespace dbscan
diff --git a/src/mlpack/methods/dbscan/dbscan_impl.hpp b/src/mlpack/methods/dbscan/dbscan_impl.hpp
index 91e0b82..f485871 100644
--- a/src/mlpack/methods/dbscan/dbscan_impl.hpp
+++ b/src/mlpack/methods/dbscan/dbscan_impl.hpp
@@ -24,10 +24,12 @@ template<typename RangeSearchType, typename PointSelectionPolicy>
DBSCAN<RangeSearchType, PointSelectionPolicy>::DBSCAN(
const double epsilon,
const size_t minPoints,
+ const bool batchMode,
RangeSearchType rangeSearch,
PointSelectionPolicy pointSelector) :
epsilon(epsilon),
minPoints(minPoints),
+ batchMode(batchMode),
rangeSearch(rangeSearch),
pointSelector(pointSelector)
{
@@ -89,8 +91,8 @@ size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
}
/**
- * Performs DBSCAN clustering on the data, returning number of clusters
- * and also the list of cluster assignments.
+ * Performs DBSCAN clustering on the data, returning the number of clusters and
+ * also the list of cluster assignments.
*/
template<typename RangeSearchType, typename PointSelectionPolicy>
template<typename MatType>
@@ -98,83 +100,101 @@ size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::Cluster(
const MatType& data,
arma::Row<size_t>& assignments)
{
- assignments.set_size(data.n_cols);
- assignments.fill(SIZE_MAX);
+ // Initialize the UnionFind object.
+ emst::UnionFind uf(data.n_cols);
+ rangeSearch.Train(data);
- size_t currentCluster = 0;
+ if (batchMode)
+ BatchCluster(data, uf);
+ else
+ PointwiseCluster(data, uf);
- // For each point, find the points in epsilon-nighborhood and their distances.
- std::vector<std::vector<size_t>> neighbors;
- std::vector<std::vector<double>> distances;
- Log::Debug << "Performing range search." << std::endl;
- rangeSearch.Train(data);
- rangeSearch.Search(data, math::Range(0.0, epsilon), neighbors, distances);
- Log::Debug << "Range search complete." << std::endl;
+ // Now set assignments.
+ assignments.set_size(data.n_cols);
+ for (size_t i = 0; i < data.n_cols; ++i)
+ assignments[i] = uf.Find(i);
+
+ // Get a count of all clusters.
+ const size_t numClusters = arma::max(assignments) + 1;
+ arma::Col<size_t> counts(numClusters, arma::fill::zeros);
+ for (size_t i = 0; i < assignments.n_elem; ++i)
+ counts[assignments[i]]++;
- // Initialize to all true; false means it's been visited.
- boost::dynamic_bitset<> unvisited(data.n_cols);
- unvisited.set();
- while (unvisited.any())
+ // Now assign clusters to new indices.
+ size_t currentCluster = 0;
+ arma::Col<size_t> newAssignments(numClusters);
+ for (size_t i = 0; i < counts.n_elem; ++i)
{
- // Select the next unvisited point.
- const size_t nextIndex = pointSelector.Select(unvisited, data);
- Log::Debug << "Inspect point " << nextIndex << "; " << unvisited.count()
- << " unvisited points remain." << std::endl;
-
- // currentCluster will only be incremented if a cluster was created.
- currentCluster = ProcessPoint(data, unvisited, nextIndex, assignments,
- currentCluster, neighbors, distances);
+ if (counts[i] >= minPoints)
+ newAssignments[i] = currentCluster++;
+ else
+ newAssignments[i] = SIZE_MAX;
}
+ // Now reassign.
+ for (size_t i = 0; i < assignments.n_elem; ++i)
+ assignments[i] = newAssignments[assignments[i]];
+
+ Log::Info << currentCluster << " clusters found." << std::endl;
+
return currentCluster;
}
/**
- * This function processes the point at index. It marks the point as visited,
- * checks if the given point is core or non-core. If it is a core point, it
- * expands the cluster, otherwise it returns.
+ * Performs DBSCAN clustering on the data, returning the number of clusters and
+ * also the list of cluster assignments. This searches each point iteratively,
+ * and can save on RAM usage. It may be slower than the batch search with a
+ * dual-tree algorithm.
*/
template<typename RangeSearchType, typename PointSelectionPolicy>
template<typename MatType>
-size_t DBSCAN<RangeSearchType, PointSelectionPolicy>::ProcessPoint(
+void DBSCAN<RangeSearchType, PointSelectionPolicy>::PointwiseCluster(
const MatType& data,
- boost::dynamic_bitset<>& unvisited,
- const size_t index,
- arma::Row<size_t>& assignments,
- const size_t currentCluster,
- const std::vector<std::vector<size_t>>& neighbors,
- const std::vector<std::vector<double>>& distances,
- const bool topLevel)
+ emst::UnionFind& uf)
{
- // We've now visited this point.
- unvisited[index] = false;
+ std::vector<std::vector<size_t>> neighbors;
+ std::vector<std::vector<double>> distances;
- if ((neighbors[index].size() < minPoints) && topLevel)
- {
- // Mark the point as noise (leave assignments[index] unset) and return.
- unvisited[index] = false;
- return currentCluster;
- }
- else
+ for (size_t i = 0; i < data.n_cols; ++i)
{
- assignments[index] = currentCluster;
+ if (i % 10000 == 0 && i > 0)
+ Log::Info << "DBSCAN clustering on point " << i << "..." << std::endl;
- // New cluster.
- for (size_t j = 0; j < neighbors[index].size(); ++j)
- {
- // Add each point to the cluster and mark it as visited, but only if it
- // has not been visited yet.
- if (!unvisited[neighbors[index][j]])
- continue;
+ // Do the range search for only this point.
+ rangeSearch.Search(data.col(i), math::Range(0.0, epsilon), neighbors,
+ distances);
- assignments[neighbors[index][j]] = currentCluster;
- unvisited[neighbors[index][j]] = false;
+ // Union to all neighbors.
+ for (size_t j = 0; j < neighbors[0].size(); ++j)
+ uf.Union(i, neighbors[0][j]);
+ }
+}
- // Recurse into this point.
- ProcessPoint(data, unvisited, neighbors[index][j], assignments,
- currentCluster, neighbors, distances, false);
- }
- return currentCluster + 1;
+/**
+ * Performs DBSCAN clustering on the data, returning number of clusters
+ * and also the list of cluster assignments. This can perform search in batch,
+ * naive search).
+ */
+template<typename RangeSearchType, typename PointSelectionPolicy>
+template<typename MatType>
+void DBSCAN<RangeSearchType, PointSelectionPolicy>::BatchCluster(
+ const MatType& data,
+ emst::UnionFind& uf)
+{
+ // For each point, find the points in epsilon-nighborhood and their distances.
+ std::vector<std::vector<size_t>> neighbors;
+ std::vector<std::vector<double>> distances;
+ Log::Info << "Performing range search." << std::endl;
+ rangeSearch.Train(data);
+ rangeSearch.Search(data, math::Range(0.0, epsilon), neighbors, distances);
+ Log::Info << "Range search complete." << std::endl;
+
+ // Now loop over all points.
+ for (size_t i = 0; i < data.n_cols; ++i)
+ {
+ // Union to all neighbors.
+ for (size_t j = 0; j < neighbors[i].size(); ++j)
+ uf.Union(i, neighbors[i][j]);
}
}
diff --git a/src/mlpack/methods/dbscan/dbscan_main.cpp b/src/mlpack/methods/dbscan/dbscan_main.cpp
index 1a02264..917df10 100644
--- a/src/mlpack/methods/dbscan/dbscan_main.cpp
+++ b/src/mlpack/methods/dbscan/dbscan_main.cpp
@@ -43,15 +43,15 @@ PROGRAM_INFO("DBSCAN clustering",
" tree used for range search; this can take a variety of values: 'kd', 'r',"
" 'r-star', 'x', 'hilbert-r', 'r-plus', 'r-plus-plus', 'cover', 'ball'. "
"The --single_mode option will force single-tree search (as opposed to the "
- "default dual-tree search), and --naive will force brute-force range "
- "search."
+ "default dual-tree search). --single_mode can be useful when the RAM usage"
+ " of batch search is too high. The --naive option will force brute-force "
+ "range search."
"\n\n"
"An example usage to run DBSCAN on the dataset in input.csv with a radius "
"of 0.5 and a minimum cluster size of 5 is given below:"
"\n\n"
" $ mlpack_dbscan -i input.csv -e 0.5 -m 5");
-
PARAM_STRING_IN_REQ("input_file", "Input dataset to cluster.", "i");
PARAM_STRING_OUT("assignments_file", "Output file for assignments of each "
"point.", "a");
@@ -82,7 +82,8 @@ void RunDBSCAN(RangeSearchType rs = RangeSearchType())
const double epsilon = CLI::GetParam<double>("epsilon");
const size_t minSize = (size_t) CLI::GetParam<int>("min_size");
- DBSCAN<RangeSearchType> d(epsilon, minSize, rs);
+ DBSCAN<RangeSearchType> d(epsilon, minSize, !CLI::HasParam("single_mode"),
+ rs);
// If possible, avoid the overhead of calculating centroids.
arma::Row<size_t> assignments;
@@ -100,7 +101,8 @@ void RunDBSCAN(RangeSearchType rs = RangeSearchType())
}
if (CLI::HasParam("assignments_file"))
- data::Save(CLI::GetParam<string>("assignments_file"), assignments, false);
+ data::Save(CLI::GetParam<string>("assignments_file"), assignments, false,
+ false /* no transpose */);
}
int main(int argc, char** argv)
diff --git a/src/mlpack/tests/dbscan_test.cpp b/src/mlpack/tests/dbscan_test.cpp
index 3e8b5ff..cd79809 100644
--- a/src/mlpack/tests/dbscan_test.cpp
+++ b/src/mlpack/tests/dbscan_test.cpp
@@ -146,4 +146,129 @@ BOOST_AUTO_TEST_CASE(GaussiansTest)
}
}
+BOOST_AUTO_TEST_CASE(OneClusterSingleModeTest)
+{
+ // Make sure that if we have points in the unit box, and if we set epsilon
+ // large enough, all points end up as in one cluster.
+ arma::mat points(10, 200, arma::fill::randu);
+
+ DBSCAN<> d(2.0, 2, false);
+
+ arma::Row<size_t> assignments;
+ const size_t clusters = d.Cluster(points, assignments);
+
+ BOOST_REQUIRE_EQUAL(clusters, 1);
+ BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+ for (size_t i = 0; i < assignments.n_elem; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], 0);
+}
+
+/**
+ * When epsilon is small enough, every point returned should be noise.
+ */
+BOOST_AUTO_TEST_CASE(TinyEpsilonSingleModeTest)
+{
+ arma::mat points(10, 200, arma::fill::randu);
+
+ DBSCAN<> d(1e-50, 2, false);
+
+ arma::Row<size_t> assignments;
+ const size_t clusters = d.Cluster(points, assignments);
+
+ BOOST_REQUIRE_EQUAL(clusters, 0);
+ BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+ for (size_t i = 0; i < assignments.n_elem; ++i)
+ BOOST_REQUIRE_EQUAL(assignments[i], SIZE_MAX);
+}
+
+/**
+ * Check that outliers are properly labeled as noise.
+ */
+BOOST_AUTO_TEST_CASE(OutlierSingleModeTest)
+{
+ arma::mat points(2, 200, arma::fill::randu);
+
+ // Add 3 outliers.
+ points.col(15) = arma::vec("10.3 1.6");
+ points.col(45) = arma::vec("-100 0.0");
+ points.col(101) = arma::vec("1.5 1.5");
+
+ DBSCAN<> d(0.1, 3, false);
+
+ arma::Row<size_t> assignments;
+ const size_t clusters = d.Cluster(points, assignments);
+
+ BOOST_REQUIRE_GT(clusters, 0);
+ BOOST_REQUIRE_EQUAL(assignments.n_elem, points.n_cols);
+ BOOST_REQUIRE_EQUAL(assignments[15], SIZE_MAX);
+ BOOST_REQUIRE_EQUAL(assignments[45], SIZE_MAX);
+ BOOST_REQUIRE_EQUAL(assignments[101], SIZE_MAX);
+}
+
+/**
+ * Check that the Gaussian clusters are correctly found.
+ */
+BOOST_AUTO_TEST_CASE(GaussiansSingleModeTest)
+{
+ arma::mat points(3, 300);
+
+ GaussianDistribution g1(3), g2(3), g3(3);
+ g1.Mean() = arma::vec("0.0 0.0 0.0");
+ g2.Mean() = arma::vec("6.0 6.0 8.0");
+ g3.Mean() = arma::vec("-6.0 1.0 -7.0");
+ for (size_t i = 0; i < 100; ++i)
+ points.col(i) = g1.Random();
+ for (size_t i = 100; i < 200; ++i)
+ points.col(i) = g2.Random();
+ for (size_t i = 200; i < 300; ++i)
+ points.col(i) = g3.Random();
+
+ DBSCAN<> d(2.0, 3);
+
+ arma::Row<size_t> assignments;
+ arma::mat centroids;
+ const size_t clusters = d.Cluster(points, assignments, centroids);
+ BOOST_REQUIRE_EQUAL(clusters, 3);
+
+ // Our centroids should be close to one of our Gaussians.
+ arma::Row<size_t> matches(3);
+ matches.fill(3);
+ for (size_t j = 0; j < 3; ++j)
+ {
+ if (arma::norm(g1.Mean() - centroids.col(j)) < 3.0)
+ matches(0) = j;
+ else if (arma::norm(g2.Mean() - centroids.col(j)) < 3.0)
+ matches(1) = j;
+ else if (arma::norm(g3.Mean() - centroids.col(j)) < 3.0)
+ matches(2) = j;
+ }
+
+ BOOST_REQUIRE_NE(matches(0), matches(1));
+ BOOST_REQUIRE_NE(matches(1), matches(2));
+ BOOST_REQUIRE_NE(matches(2), matches(0));
+
+ BOOST_REQUIRE_NE(matches(0), 3);
+ BOOST_REQUIRE_NE(matches(1), 3);
+ BOOST_REQUIRE_NE(matches(2), 3);
+
+ for (size_t i = 0; i < 100; ++i)
+ {
+ // Each point should either be noise or in cluster matches(0).
+ BOOST_REQUIRE_NE(assignments(i), matches(1));
+ BOOST_REQUIRE_NE(assignments(i), matches(2));
+ }
+
+ for (size_t i = 100; i < 200; ++i)
+ {
+ BOOST_REQUIRE_NE(assignments(i), matches(0));
+ BOOST_REQUIRE_NE(assignments(i), matches(2));
+ }
+
+ for (size_t i = 200; i < 300; ++i)
+ {
+ BOOST_REQUIRE_NE(assignments(i), matches(0));
+ BOOST_REQUIRE_NE(assignments(i), matches(1));
+ }
+}
+
BOOST_AUTO_TEST_SUITE_END();
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list