[mlpack] 91/207: Implement multidimensional discrete distribution
Barak A. Pearlmutter
barak+git at pearlmutter.net
Thu Mar 23 17:53:43 UTC 2017
This is an automated email from the git hooks/post-receive script.
bap pushed a commit to branch master
in repository mlpack.
commit e17939783300ba58c825c8ca32dcc9bda27251c9
Author: Tiramisu 1993 <sabergeass at gmail.com>
Date: Sat Feb 18 13:10:18 2017 +0800
Implement multidimensional discrete distribution
---
src/mlpack/core/dists/discrete_distribution.cpp | 138 +++++++++++++++---------
src/mlpack/core/dists/discrete_distribution.hpp | 91 +++++++++++-----
src/mlpack/tests/distribution_test.cpp | 67 ++++++++++++
src/mlpack/tests/hmm_test.cpp | 14 +--
src/mlpack/tests/serialization_test.cpp | 3 +-
5 files changed, 228 insertions(+), 85 deletions(-)
diff --git a/src/mlpack/core/dists/discrete_distribution.cpp b/src/mlpack/core/dists/discrete_distribution.cpp
index b93004e..66b1fa5 100644
--- a/src/mlpack/core/dists/discrete_distribution.cpp
+++ b/src/mlpack/core/dists/discrete_distribution.cpp
@@ -20,22 +20,29 @@ using namespace mlpack::distribution;
*/
arma::vec DiscreteDistribution::Random() const
{
- // Generate a random number.
- double randObs = math::Random();
- arma::vec result(1);
+ size_t dimension = probabilities.size();
+ arma::vec result(dimension);
- double sumProb = 0;
- for (size_t obs = 0; obs < probabilities.n_elem; obs++)
+ for (size_t d = 0; d < dimension; d++)
{
- if ((sumProb += probabilities[obs]) >= randObs)
+ // Generate a random number.
+ double randObs = math::Random();
+
+ double sumProb = 0;
+
+ for (size_t obs = 0; obs < probabilities[d].n_elem; obs++)
{
- result[0] = obs;
- return result;
+ if ((sumProb += probabilities[d][obs]) >= randObs)
+ {
+ result[d] = obs;
+ break;
+ }
}
- }
- // This shouldn't happen.
- result[0] = probabilities.n_elem - 1;
+ if (sumProb >= randObs != true)
+ // This shouldn't happen.
+ result[d] = probabilities[d].n_elem - 1;
+ }
return result;
}
@@ -44,33 +51,45 @@ arma::vec DiscreteDistribution::Random() const
*/
void DiscreteDistribution::Train(const arma::mat& observations)
{
- // Clear old probabilities.
- probabilities.zeros();
+ // Make sure the observations have same dimension with the probabilities
+ if(observations.n_rows != probabilities.size())
+ {
+ Log::Debug << "the obversation must has the same dimension with the probabilities"
+ << "the observation's dimension is" << observations.n_cols << "but the dimension of "
+ << "probabilities is" << probabilities.size() << std::endl;
+ }
+ // Get the dimension size of the distribution
+ const size_t dimensions = probabilities.size();
- // Add the probability of each observation. The addition of 0.5 to the
- // observation is to turn the default flooring operation of the size_t cast
- // into a rounding operation.
- for (size_t i = 0; i < observations.n_cols; i++)
+ // Iterate all the probabilities in each dimension
+ for (size_t i=0; i < dimensions; i++)
{
- const size_t obs = size_t(observations(0, i) + 0.5);
+ // Clear the old probabilities
+ probabilities[i].zeros();
+ for (size_t r=0; r < observations.n_cols; r++)
+ {
+ // Add the probability of each observation. The addition of 0.5 to the
+ // observation is to turn the default flooring operation of the size_t cast
+ // into a rounding observation.
+ const size_t obs = size_t(observations(i, r) + 0.5);
- // Ensure that the observation is within the bounds.
- if (obs >= probabilities.n_elem)
- {
- Log::Debug << "DiscreteDistribution::Train(): observation " << i
- << " (" << obs << ") is invalid; observation must be in [0, "
- << probabilities.n_elem << "] for this distribution." << std::endl;
- }
+ // Ensure that the observation is within the bounds.
+ if (obs >= probabilities[i].n_elem)
+ {
+ Log::Debug << "DiscreteDistribution::Train(): observation " << i
+ << " (" << obs << ") is invalid; observation must be in [0, "
+ << probabilities[i].n_elem << "] for this distribution." << std::endl;
+ }
+ probabilities[i][obs]++;
+ }
- probabilities(obs)++;
+ // Now normailze the distribution.
+ double sum = accu(probabilities[i]);
+ if (sum > 0)
+ probabilities[i] /= sum;
+ else
+ probabilities[i].fill(1.0 / probabilities[i].n_elem); // Force normalization.
}
-
- // Now normalize the distribution.
- double sum = accu(probabilities);
- if (sum > 0)
- probabilities /= sum;
- else
- probabilities.fill(1 / probabilities.n_elem); // Force normalization.
}
/**
@@ -80,31 +99,46 @@ void DiscreteDistribution::Train(const arma::mat& observations)
void DiscreteDistribution::Train(const arma::mat& observations,
const arma::vec& probObs)
{
- // Clear old probabilities.
- probabilities.zeros();
+ // Make sure the observations have same dimension with the probabilities
+ if(observations.n_rows != probabilities.size())
+ {
+ Log::Debug << "the obversation must has the same dimension with the probabilities"
+ << "the observation's dimension is" << observations.n_rows<< "but the dimension of "
+ << "probabilities is" << probabilities.size() << std::endl;
+ }
- // Add the probability of each observation. The addition of 0.5 to the
- // observation is to turn the default flooring operation of the size_t cast
- // into a rounding observation.
- for (size_t i = 0; i < observations.n_cols; i++)
+ // Get the dimension size of the distribution
+ size_t dimensions = probabilities.size();
+ for (size_t i=0; i < dimensions; i++)
{
- const size_t obs = size_t(observations(0, i) + 0.5);
+ // Clear the old probabilities
+ probabilities[i].zeros();
// Ensure that the observation is within the bounds.
- if (obs >= probabilities.n_elem)
+ for (size_t r=0; r < observations.n_cols; r++)
{
- Log::Debug << "DiscreteDistribution::Train(): observation " << i
- << " (" << obs << ") is invalid; observation must be in [0, "
- << probabilities.n_elem << "] for this distribution." << std::endl;
+ // Add the probability of each observation. The addition of 0.5 to the
+ // observation is to turn the default flooring operation of the size_t cast
+ // into a rounding observation.
+
+ const size_t obs = size_t(observations(i, r)+ 0.5);
+
+ // Ensure that the observation is within the bounds.
+ if (obs >= probabilities[i].n_elem)
+ {
+ Log::Debug << "DiscreteDistribution::Train(): observation " << i
+ << " (" << obs << ") is invalid; observation must be in [0, "
+ << probabilities[i].n_elem << "] for this distribution." << std::endl;
+ }
+
+ probabilities[i][obs] += probObs[r];
}
- probabilities(obs) += probObs[i];
+ // Now normailze the distribution.
+ double sum = accu(probabilities[i]);
+ if (sum > 0)
+ probabilities[i] /= sum;
+ else
+ probabilities[i].fill(1.0 / probabilities[i].n_elem); // Force normalization.
}
-
- // Now normalize the distribution.
- double sum = accu(probabilities);
- if (sum > 0)
- probabilities /= sum;
- else
- probabilities.fill(1 / probabilities.n_elem); // Force normalization.
}
diff --git a/src/mlpack/core/dists/discrete_distribution.hpp b/src/mlpack/core/dists/discrete_distribution.hpp
index a31ca0c..f3ccded 100644
--- a/src/mlpack/core/dists/discrete_distribution.hpp
+++ b/src/mlpack/core/dists/discrete_distribution.hpp
@@ -48,7 +48,8 @@ class DiscreteDistribution
/**
* Default constructor, which creates a distribution that has no observations.
*/
- DiscreteDistribution() { /* nothing to do */ }
+ DiscreteDistribution() :
+ probabilities(std::vector<arma::vec>(1)){ /* nothing to do */ }
/**
* Define the discrete distribution as having numObservations possible
@@ -59,32 +60,56 @@ class DiscreteDistribution
* can have.
*/
DiscreteDistribution(const size_t numObservations) :
- probabilities(arma::ones<arma::vec>(numObservations) / numObservations)
+ probabilities(std::vector<arma::vec>(1, arma::ones<arma::vec>(numObservations)/numObservations))
{ /* nothing to do */ }
/**
- * Define the discrete distribution as having the given probabilities for each
+ * Define the multidimensional discrete distribution as having numObservations possible
+ * observations. The probability in each state will be set to (1 /
+ * numObservations of each dimension).
+ *
+ * @param numObservations Number of possible observations this distribution
+ * can have.
+ */
+ DiscreteDistribution(const arma::vec& numObservations)
+ {
+ for (size_t i=0; i<numObservations.n_elem; i++)
+ {
+ const size_t numObs = size_t(numObservations[i]);
+ if (numObs <= 0)
+ {
+ Log::Debug << "The number of observation in each dimension must greater than 0"
+ << "but the given observation number in"<< i <<" dimension is "<< numObs << std::endl;
+ }
+ probabilities.push_back(arma::ones<arma::vec>(numObs)/numObs);
+ }
+ }
+
+ /**
+ * Define the multidimensional discrete distribution as having the given probabilities for each
* observation.
*
* @param probabilities Probabilities of each possible observation.
*/
- DiscreteDistribution(const arma::vec& probabilities)
+ DiscreteDistribution(const std::vector<arma::vec>& probabilities)
{
- // We must be sure that our distribution is normalized.
- double sum = accu(probabilities);
- if (sum > 0)
- this->probabilities = probabilities / sum;
- else
+ for (size_t i=0; i<probabilities.size(); i++)
{
- this->probabilities.set_size(probabilities.n_elem);
- this->probabilities.fill(1 / probabilities.n_elem);
+ arma::vec temp = probabilities[i];
+ double sum = accu(temp);
+ if (sum > 0)
+ this->probabilities.push_back(temp / sum);
+ else
+ {
+ this->probabilities.push_back(arma::ones<arma::vec>(temp.n_elem)/temp.n_elem);
+ }
}
}
/**
* Get the dimensionality of the distribution.
*/
- static size_t Dimensionality() { return 1; }
+ size_t Dimensionality() const { return probabilities.size(); }
/**
* Return the probability of the given observation. If the observation is
@@ -96,19 +121,32 @@ class DiscreteDistribution
*/
double Probability(const arma::vec& observation) const
{
- // Adding 0.5 helps ensure that we cast the floating point to a size_t
- // correctly.
- const size_t obs = size_t(observation[0] + 0.5);
-
- // Ensure that the observation is within the bounds.
- if (obs >= probabilities.n_elem)
+ double probability = 1.0;
+ // Ensure the observation has the same dimension with the probabilities
+ if (observation.n_elem != probabilities.size())
{
- Log::Debug << "DiscreteDistribution::Probability(): received observation "
- << obs << "; observation must be in [0, " << probabilities.n_elem
- << "] for this distribution." << std::endl;
+ Log::Debug << "the obversation must has the same dimension with the probabilities"
+ << "the observation's dimension is" << observation.n_elem << "but the dimension of "
+ << "probabilities is" << probabilities.size() << std::endl;
+ return probability;
+ }
+ for (size_t dimension = 0; dimension < observation.n_elem; dimension++)
+ {
+ // Adding 0.5 helps ensure that we cast the floating point to a size_t
+ // correctly.
+ const size_t obs = size_t(observation(dimension) + 0.5);
+
+ // Ensure that the observation is within the bounds.
+ if (obs >= probabilities[dimension].n_elem)
+ {
+ Log::Debug << "DiscreteDistribution::Probability(): received observation "
+ << obs << "; observation must be in [0, " << probabilities[dimension].n_elem
+ << "] for this distribution." << std::endl;
+ }
+ probability *= probabilities[dimension][obs];
}
- return probabilities(obs);
+ return probability;
}
/**
@@ -156,9 +194,10 @@ class DiscreteDistribution
const arma::vec& probabilities);
//! Return the vector of probabilities.
- const arma::vec& Probabilities() const { return probabilities; }
+ arma::vec& Probabilities(const size_t dim = 0) { return probabilities[dim];}
+
//! Modify the vector of probabilities.
- arma::vec& Probabilities() { return probabilities; }
+ const arma::vec& Probabilities(const size_t dim = 0) const { return probabilities[dim];}
/**
* Serialize the distribution.
@@ -171,7 +210,9 @@ class DiscreteDistribution
}
private:
- arma::vec probabilities;
+ // The Probability Martix which each column represent one dimension and
+ // the row represent the observations in that dimension.
+ std::vector<arma::vec> probabilities;
};
} // namespace distribution
diff --git a/src/mlpack/tests/distribution_test.cpp b/src/mlpack/tests/distribution_test.cpp
index 7de0776..b5597e7 100644
--- a/src/mlpack/tests/distribution_test.cpp
+++ b/src/mlpack/tests/distribution_test.cpp
@@ -119,6 +119,73 @@ BOOST_AUTO_TEST_CASE(DiscreteDistributionTrainProbTest)
BOOST_REQUIRE_CLOSE(d.Probability("2"), 0.5, 1e-5);
}
+/**
+ * Achieve multidimensional probability distribution
+ */
+BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionTrainProbTest)
+{
+ DiscreteDistribution d("10 10 10");
+
+ arma::mat obs("0 1 1 1 2 2 2 2 2 2;"
+ "0 0 0 1 1 1 2 2 2 2;"
+ "0 0 0 1 1 2 2 2 2 2;");
+
+ d.Train(obs);
+ BOOST_REQUIRE_CLOSE(d.Probability("0 0 0"), 0.009, 1e-5);
+ BOOST_REQUIRE_CLOSE(d.Probability("0 1 2"), 0.015, 1e-5);
+ BOOST_REQUIRE_CLOSE(d.Probability("2 1 0"), 0.054, 1e-5);
+}
+
+/**
+ * Make sure we initialize multidimensional probability distribution
+ * correctly.
+ */
+BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionConstructorTest)
+{
+ DiscreteDistribution d("4 4 4 4");
+
+ BOOST_REQUIRE_EQUAL(d.Probabilities(0).size(), 4);
+ BOOST_REQUIRE_EQUAL(d.Dimensionality(), 4);
+ BOOST_REQUIRE_CLOSE(d.Probability("0 0 0 0"), 0.00390625, 1e-5);
+ BOOST_REQUIRE_CLOSE(d.Probability("0 1 2 3"), 0.00390625, 1e-5);
+}
+
+/**
+ * Achieve multidimensional probability distribution
+ */
+BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionTrainTest)
+{
+ std::vector<arma::vec> pro{{0.1, 0.3, 0.6},
+ {0.3, 0.3, 0.3},
+ {0.25, 0.25, 0.5}};
+
+ DiscreteDistribution d(pro);
+
+ BOOST_REQUIRE_CLOSE(d.Probability("0 0 0"), 0.0083333, 1e-3);
+ BOOST_REQUIRE_CLOSE(d.Probability("0 1 2"), 0.0166666, 1e-3);
+ BOOST_REQUIRE_CLOSE(d.Probability("2 1 0"), 0.05, 1e-5);
+}
+
+/**
+ * Estimate multidimensional probability distribution from observations with probabilities.
+ */
+BOOST_AUTO_TEST_CASE(MultiDiscreteDistributionTrainProTest)
+{
+ DiscreteDistribution d("5 5 5");
+
+ arma::mat obs("0 0 1 1 2;"
+ "0 1 1 2 2;"
+ "0 1 1 2 2");
+
+ arma::vec prob("0.25 0.25 0.25 0.25 1");
+
+ d.Train(obs, prob);
+
+ BOOST_REQUIRE_CLOSE(d.Probability("0 0 0"), 0.00390625, 1e-5);
+ BOOST_REQUIRE_CLOSE(d.Probability("1 0 1"), 0.0078125, 1e-5);
+ BOOST_REQUIRE_CLOSE(d.Probability("2 1 0"), 0.015625, 1e-5);
+}
+
/*********************************/
/** Gaussian Distribution Tests **/
/*********************************/
diff --git a/src/mlpack/tests/hmm_test.cpp b/src/mlpack/tests/hmm_test.cpp
index c9016c5..4117571 100644
--- a/src/mlpack/tests/hmm_test.cpp
+++ b/src/mlpack/tests/hmm_test.cpp
@@ -41,8 +41,8 @@ BOOST_AUTO_TEST_CASE(SimpleDiscreteHMMTestViterbi)
arma::vec initial("1 0"); // Default MATLAB initial states.
arma::mat transition("0.7 0.3; 0.3 0.7");
std::vector<DiscreteDistribution> emission(2);
- emission[0] = DiscreteDistribution("0.9 0.1");
- emission[1] = DiscreteDistribution("0.2 0.8");
+ emission[0] = DiscreteDistribution(std::vector<arma::vec>{"0.9 0.1"});
+ emission[1] = DiscreteDistribution(std::vector<arma::vec>{"0.2 0.8"});
HMM<DiscreteDistribution> hmm(initial, transition, emission);
@@ -78,9 +78,9 @@ BOOST_AUTO_TEST_CASE(BorodovskyHMMTestViterbi)
"0.5 0.5 0.6");
// Four emission states: A, C, G, T. Start state doesn't emit...
std::vector<DiscreteDistribution> emission(3);
- emission[0] = DiscreteDistribution("0.25 0.25 0.25 0.25");
- emission[1] = DiscreteDistribution("0.20 0.30 0.30 0.20");
- emission[2] = DiscreteDistribution("0.30 0.20 0.20 0.30");
+ emission[0] = DiscreteDistribution(std::vector<arma::vec>{"0.25 0.25 0.25 0.25"});
+ emission[1] = DiscreteDistribution(std::vector<arma::vec>{"0.20 0.30 0.30 0.20"});
+ emission[2] = DiscreteDistribution(std::vector<arma::vec>{"0.30 0.20 0.20 0.30"});
HMM<DiscreteDistribution> hmm(initial, transition, emission);
@@ -118,8 +118,8 @@ BOOST_AUTO_TEST_CASE(ForwardBackwardTwoState)
arma::vec initial("0.1 0.4");
arma::mat transition("0.1 0.9; 0.4 0.6");
std::vector<DiscreteDistribution> emis(2);
- emis[0] = DiscreteDistribution("0.85 0.15 0.00 0.00");
- emis[1] = DiscreteDistribution("0.00 0.00 0.50 0.50");
+ emis[0] = DiscreteDistribution(std::vector<arma::vec>{"0.85 0.15 0.00 0.00"});
+ emis[1] = DiscreteDistribution(std::vector<arma::vec>{"0.00 0.00 0.50 0.50"});
HMM<DiscreteDistribution> hmm(initial, transition, emis);
diff --git a/src/mlpack/tests/serialization_test.cpp b/src/mlpack/tests/serialization_test.cpp
index fd9925b..0f27bcc 100644
--- a/src/mlpack/tests/serialization_test.cpp
+++ b/src/mlpack/tests/serialization_test.cpp
@@ -156,7 +156,8 @@ BOOST_AUTO_TEST_CASE(DiscreteDistributionTest)
// straightforward.
vec prob;
prob.randu(12);
- DiscreteDistribution t(prob);
+ std::vector<arma::vec> prob_vector = std::vector<arma::vec>(1, prob);
+ DiscreteDistribution t(prob_vector);
DiscreteDistribution xmlT, textT, binaryT;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mlpack.git
More information about the debian-science-commits
mailing list