[compute] 20/49: Using serial merge in merge algorithm for small inputs

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Dec 18 17:58:17 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository compute.

commit e3324f887da776dc726d61beb492d4267d352d88
Author: Jakub Szuppe <j.szuppe at gmail.com>
Date:   Sat Aug 1 18:46:28 2015 +0200

    Using serial merge in merge algorithm for small inputs
---
 .../algorithm/detail/merge_with_merge_path.hpp     | 51 ++++++++++++----------
 include/boost/compute/algorithm/merge.hpp          | 34 +++++++++++++++
 perf/perf_merge.cpp                                |  2 +
 perf/perf_stl_merge.cpp                            |  4 +-
 4 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp b/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp
index 9525289..0cc0256 100644
--- a/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp
+++ b/include/boost/compute/algorithm/detail/merge_with_merge_path.hpp
@@ -43,12 +43,12 @@ public:
              class InputIterator3, class InputIterator4,
              class OutputIterator, class Compare>
     void set_range(InputIterator1 first1,
-                    InputIterator2 first2,
-                    InputIterator3 tile_first1,
-                    InputIterator3 tile_last1,
-                    InputIterator4 tile_first2,
-                    OutputIterator result,
-                    Compare comp)
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result,
+                   Compare comp)
     {
         m_count = iterator_range_size(tile_first1, tile_last1) - 1;
 
@@ -97,11 +97,11 @@ public:
              class InputIterator3, class InputIterator4,
              class OutputIterator>
     void set_range(InputIterator1 first1,
-                    InputIterator2 first2,
-                    InputIterator3 tile_first1,
-                    InputIterator3 tile_last1,
-                    InputIterator4 tile_first2,
-                    OutputIterator result)
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result)
     {
         typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
         ::boost::compute::less<value_type> less_than;
@@ -140,13 +140,16 @@ private:
 template<class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
 inline OutputIterator
 merge_with_merge_path(InputIterator1 first1,
-                        InputIterator1 last1,
-                        InputIterator2 first2,
-                        InputIterator2 last2,
-                        OutputIterator result,
-                        Compare comp,
-                        command_queue &queue = system::default_queue())
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      InputIterator2 last2,
+                      OutputIterator result,
+                      Compare comp,
+                      command_queue &queue = system::default_queue())
 {
+   typedef typename
+       std::iterator_traits<OutputIterator>::difference_type result_difference_type;
+
     int tile_size = 1024;
 
     int count1 = iterator_range_size(first1, last1);
@@ -171,22 +174,22 @@ merge_with_merge_path(InputIterator1 first1,
     serial_merge_kernel merge_kernel;
     merge_kernel.tile_size = 1024;
     merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
-                            tile_b.begin(), result, comp);
+                           tile_b.begin(), result, comp);
 
     merge_kernel.exec(queue);
 
-    return result + count1 + count2;
+    return result + static_cast<result_difference_type>(count1 + count2);
 }
 
 /// \overload
 template<class InputIterator1, class InputIterator2, class OutputIterator>
 inline OutputIterator
 merge_with_merge_path(InputIterator1 first1,
-                        InputIterator1 last1,
-                        InputIterator2 first2,
-                        InputIterator2 last2,
-                        OutputIterator result,
-                        command_queue &queue = system::default_queue())
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      InputIterator2 last2,
+                      OutputIterator result,
+                      command_queue &queue = system::default_queue())
 {
     typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
     ::boost::compute::less<value_type> less_than;
diff --git a/include/boost/compute/algorithm/merge.hpp b/include/boost/compute/algorithm/merge.hpp
index a7fcb53..9ac1c97 100644
--- a/include/boost/compute/algorithm/merge.hpp
+++ b/include/boost/compute/algorithm/merge.hpp
@@ -16,6 +16,8 @@
 #include <boost/compute/algorithm/copy.hpp>
 #include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
 #include <boost/compute/algorithm/detail/serial_merge.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
 
 namespace boost {
 namespace compute {
@@ -48,6 +50,38 @@ inline OutputIterator merge(InputIterator1 first1,
                             Compare comp,
                             command_queue &queue = system::default_queue())
 {
+    typedef typename std::iterator_traits<InputIterator1>::value_type input1_type;
+    typedef typename std::iterator_traits<InputIterator2>::value_type input2_type;
+    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
+
+    const device &device = queue.get_device();
+
+    std::string cache_key =
+        std::string("__boost_merge_") + type_name<input1_type>() + "_"
+        + type_name<input2_type>() + "_" + type_name<output_type>();
+    boost::shared_ptr<detail::parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // default serial merge threshold depends on device type
+    size_t default_serial_merge_threshold = 32768;
+    if(device.type() & device::gpu) {
+        default_serial_merge_threshold = 2048;
+    }
+
+    // loading serial merge threshold parameter
+    const size_t serial_merge_threshold =
+                   parameters->get(cache_key, "serial_merge_threshold",
+                                   default_serial_merge_threshold);
+
+    // choosing merge algorithm
+    const size_t total_count =
+        detail::iterator_range_size(first1, last1)
+        + detail::iterator_range_size(first2, last2);
+    // for small inputs serial merge turns out to outperform
+    // merge with merge path algorithm
+    if(total_count <= serial_merge_threshold){
+       return detail::serial_merge(first1, last1, first2, last2, result, comp, queue);
+    }
     return detail::merge_with_merge_path(first1, last1, first2, last2, result, comp, queue);
 }
 
diff --git a/perf/perf_merge.cpp b/perf/perf_merge.cpp
index 2aac4af..58ea836 100644
--- a/perf/perf_merge.cpp
+++ b/perf/perf_merge.cpp
@@ -28,6 +28,7 @@ int main(int argc, char *argv[])
     boost::compute::device device = boost::compute::system::default_device();
     boost::compute::context context(device);
     boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
 
     std::vector<int> v1 = generate_random_vector<int>(std::floor(PERF_N / 2.0));
     std::vector<int> v2 = generate_random_vector<int>(std::ceil(PERF_N / 2.0));
@@ -48,6 +49,7 @@ int main(int argc, char *argv[])
                               gpu_v3.begin(),
                               queue
         );
+        queue.finish();
         t.stop();
     }
     std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
diff --git a/perf/perf_stl_merge.cpp b/perf/perf_stl_merge.cpp
index 977134f..0a842a0 100644
--- a/perf/perf_stl_merge.cpp
+++ b/perf/perf_stl_merge.cpp
@@ -19,8 +19,8 @@ int main(int argc, char *argv[])
     perf_parse_args(argc, argv);
 
     std::cout << "size: " << PERF_N << std::endl;
-    std::vector<int> v1 = generate_random_vector<int>(PERF_N / 2);
-    std::vector<int> v2 = generate_random_vector<int>(PERF_N / 2);
+    std::vector<int> v1 = generate_random_vector<int>(std::floor(PERF_N / 2.0));
+    std::vector<int> v2 = generate_random_vector<int>(std::ceil(PERF_N / 2.0));
     std::vector<int> v3(PERF_N);
 
     std::sort(v1.begin(), v1.end());

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/compute.git



More information about the debian-science-commits mailing list