[compute] 20/49: Using serial merge in merge algorithm for small inputs
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri Dec 18 17:58:17 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master in repository compute.
commit e3324f887da776dc726d61beb492d4267d352d88
Author: Jakub Szuppe <j.szuppe at gmail.com>
Date: Sat Aug 1 18:46:28 2015 +0200
Using serial merge in merge algorithm for small inputs
---
.../algorithm/detail/merge_with_merge_path.hpp | 51 ++++++++++++----------
include/boost/compute/algorithm/merge.hpp | 34 +++++++++++++++
perf/perf_merge.cpp | 2 +
perf/perf_stl_merge.cpp | 4 +-
4 files changed, 65 insertions(+), 26 deletions(-)
diff --git a/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp b/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp
index 9525289..0cc0256 100644
--- a/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp
+++ b/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp
@@ -43,12 +43,12 @@ public:
class InputIterator3, class InputIterator4,
class OutputIterator, class Compare>
void set_range(InputIterator1 first1,
- InputIterator2 first2,
- InputIterator3 tile_first1,
- InputIterator3 tile_last1,
- InputIterator4 tile_first2,
- OutputIterator result,
- Compare comp)
+ InputIterator2 first2,
+ InputIterator3 tile_first1,
+ InputIterator3 tile_last1,
+ InputIterator4 tile_first2,
+ OutputIterator result,
+ Compare comp)
{
m_count = iterator_range_size(tile_first1, tile_last1) - 1;
@@ -97,11 +97,11 @@ public:
class InputIterator3, class InputIterator4,
class OutputIterator>
void set_range(InputIterator1 first1,
- InputIterator2 first2,
- InputIterator3 tile_first1,
- InputIterator3 tile_last1,
- InputIterator4 tile_first2,
- OutputIterator result)
+ InputIterator2 first2,
+ InputIterator3 tile_first1,
+ InputIterator3 tile_last1,
+ InputIterator4 tile_first2,
+ OutputIterator result)
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
::boost::compute::less<value_type> less_than;
@@ -140,13 +140,16 @@ private:
template<class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
inline OutputIterator
merge_with_merge_path(InputIterator1 first1,
- InputIterator1 last1,
- InputIterator2 first2,
- InputIterator2 last2,
- OutputIterator result,
- Compare comp,
- command_queue &queue = system::default_queue())
+ InputIterator1 last1,
+ InputIterator2 first2,
+ InputIterator2 last2,
+ OutputIterator result,
+ Compare comp,
+ command_queue &queue = system::default_queue())
{
+ typedef typename
+ std::iterator_traits<OutputIterator>::difference_type result_difference_type;
+
int tile_size = 1024;
int count1 = iterator_range_size(first1, last1);
@@ -171,22 +174,22 @@ merge_with_merge_path(InputIterator1 first1,
serial_merge_kernel merge_kernel;
merge_kernel.tile_size = 1024;
merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
- tile_b.begin(), result, comp);
+ tile_b.begin(), result, comp);
merge_kernel.exec(queue);
- return result + count1 + count2;
+ return result + static_cast<result_difference_type>(count1 + count2);
}
/// \overload
template<class InputIterator1, class InputIterator2, class OutputIterator>
inline OutputIterator
merge_with_merge_path(InputIterator1 first1,
- InputIterator1 last1,
- InputIterator2 first2,
- InputIterator2 last2,
- OutputIterator result,
- command_queue &queue = system::default_queue())
+ InputIterator1 last1,
+ InputIterator2 first2,
+ InputIterator2 last2,
+ OutputIterator result,
+ command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
::boost::compute::less<value_type> less_than;
diff --git a/include/boost/compute/algorithm/merge.hpp b/include/boost/compute/algorithm/merge.hpp
index a7fcb53..9ac1c97 100644
--- a/include/boost/compute/algorithm/merge.hpp
+++ b/include/boost/compute/algorithm/merge.hpp
@@ -16,6 +16,8 @@
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
#include <boost/compute/algorithm/detail/serial_merge.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
@@ -48,6 +50,38 @@ inline OutputIterator merge(InputIterator1 first1,
Compare comp,
command_queue &queue = system::default_queue())
{
+ typedef typename std::iterator_traits<InputIterator1>::value_type input1_type;
+ typedef typename std::iterator_traits<InputIterator2>::value_type input2_type;
+ typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
+
+ const device &device = queue.get_device();
+
+ std::string cache_key =
+ std::string("__boost_merge_") + type_name<input1_type>() + "_"
+ + type_name<input2_type>() + "_" + type_name<output_type>();
+ boost::shared_ptr<detail::parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ // default serial merge threshold depends on device type
+ size_t default_serial_merge_threshold = 32768;
+ if(device.type() & device::gpu) {
+ default_serial_merge_threshold = 2048;
+ }
+
+ // loading serial merge threshold parameter
+ const size_t serial_merge_threshold =
+ parameters->get(cache_key, "serial_merge_threshold",
+ default_serial_merge_threshold);
+
+ // choosing merge algorithm
+ const size_t total_count =
+ detail::iterator_range_size(first1, last1)
+ + detail::iterator_range_size(first2, last2);
+ // for small inputs serial merge turns out to outperform
+ // merge with merge path algorithm
+ if(total_count <= serial_merge_threshold){
+ return detail::serial_merge(first1, last1, first2, last2, result, comp, queue);
+ }
return detail::merge_with_merge_path(first1, last1, first2, last2, result, comp, queue);
}
diff --git a/perf/perf_merge.cpp b/perf/perf_merge.cpp
index 2aac4af..58ea836 100644
--- a/perf/perf_merge.cpp
+++ b/perf/perf_merge.cpp
@@ -28,6 +28,7 @@ int main(int argc, char *argv[])
boost::compute::device device = boost::compute::system::default_device();
boost::compute::context context(device);
boost::compute::command_queue queue(context, device);
+ std::cout << "device: " << device.name() << std::endl;
std::vector<int> v1 = generate_random_vector<int>(std::floor(PERF_N / 2.0));
std::vector<int> v2 = generate_random_vector<int>(std::ceil(PERF_N / 2.0));
@@ -48,6 +49,7 @@ int main(int argc, char *argv[])
gpu_v3.begin(),
queue
);
+ queue.finish();
t.stop();
}
std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
diff --git a/perf/perf_stl_merge.cpp b/perf/perf_stl_merge.cpp
index 977134f..0a842a0 100644
--- a/perf/perf_stl_merge.cpp
+++ b/perf/perf_stl_merge.cpp
@@ -19,8 +19,8 @@ int main(int argc, char *argv[])
perf_parse_args(argc, argv);
std::cout << "size: " << PERF_N << std::endl;
- std::vector<int> v1 = generate_random_vector<int>(PERF_N / 2);
- std::vector<int> v2 = generate_random_vector<int>(PERF_N / 2);
+ std::vector<int> v1 = generate_random_vector<int>(std::floor(PERF_N / 2.0));
+ std::vector<int> v2 = generate_random_vector<int>(std::ceil(PERF_N / 2.0));
std::vector<int> v3(PERF_N);
std::sort(v1.begin(), v1.end());
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/compute.git
More information about the debian-science-commits mailing list