[compute] 26/46: Fix find_extrema_with_reduce

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Dec 21 18:28:42 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository compute.

commit 2d972fe2a4b7318e40db9451ea93cfa15808476b
Author: Jakub Szuppe <j.szuppe at gmail.com>
Date:   Thu Sep 24 20:29:25 2015 +0200

    Fix find_extrema_with_reduce
    
    Now find_extrema_with_reduce always return the first extremum
    in the given range.
---
 .../algorithm/detail/find_extrema_with_reduce.hpp  | 345 +++++++++++++--------
 1 file changed, 224 insertions(+), 121 deletions(-)

diff --git a/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
index 55fb688..a157e24 100644
--- a/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
+++ b/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
@@ -12,11 +12,11 @@
 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
 
 #include <algorithm>
-#include <vector>
 
 #include <boost/compute/types.hpp>
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
 #include <boost/compute/container/vector.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
@@ -24,7 +24,6 @@
 #include <boost/compute/memory/local_buffer.hpp>
 #include <boost/compute/type_traits/type_name.hpp>
 #include <boost/compute/utility/program_cache.hpp>
-#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
 
 namespace boost {
 namespace compute {
@@ -73,16 +72,24 @@ bool find_extrema_with_reduce_requirements_met(InputIterator first,
     return ((required_local_mem_size * 4) <= local_mem_size);
 }
 
+/// \internal_
+/// Algorithm finds the first extremum in given range, i.e., with the lowest
+/// index.
+///
+/// If \p use_input_idx is false, it's assumed that input data is ordered by
+/// increasing index and \p input_idx is not used in the algorithm.
 template<class InputIterator, class ResultIterator, class Compare>
-inline size_t find_extrema_with_reduce(InputIterator first,
-                                       size_t count,
-                                       ResultIterator result,
-                                       vector<uint_>::iterator result_idx,
-                                       size_t work_groups_no,
-                                       size_t work_group_size,
-                                       Compare compare,
-                                       const bool find_minimum,
-                                       command_queue &queue)
+inline void find_extrema_with_reduce(InputIterator input,
+                                     vector<uint_>::iterator input_idx,
+                                     size_t count,
+                                     ResultIterator result,
+                                     vector<uint_>::iterator result_idx,
+                                     size_t work_groups_no,
+                                     size_t work_group_size,
+                                     Compare compare,
+                                     const bool find_minimum,
+                                     const bool use_input_idx,
+                                     command_queue &queue)
 {
     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 
@@ -90,44 +97,68 @@ inline size_t find_extrema_with_reduce(InputIterator first,
 
     meta_kernel k("find_extrema_reduce");
     size_t count_arg = k.add_arg<uint_>("count");
-    size_t output_arg = k.add_arg<input_type *>(memory_object::global_memory, "output");
-    size_t output_idx_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");
     size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
     size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
 
     k <<
         // Work item global id
         k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
-        //
-        "if(gid >= count) {\n return;\n }\n" <<
+        "if(gid >= count) {\n    return;\n}\n" <<
 
         // Index of element that will be read from input buffer
         k.decl<uint_>("idx") << " = gid;\n" <<
 
         k.decl<input_type>("acc") << ";\n" <<
-        // Index of currently best element
+        // Real index of currently best element
+        "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+        k.decl<input_type>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+        "#else\n" <<
         k.decl<uint_>("acc_idx") << " = idx;\n" <<
+        "#endif\n" <<
 
         // Init accumulator with first[get_global_id(0)]
-        "acc = " << first[k.var<uint_>("idx")] << ";\n" <<
+        "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
         "idx += get_global_size(0);\n" <<
 
         k.decl<bool>("compare_result") << ";\n" <<
+        k.decl<bool>("equal") << ";\n\n" <<
         "while( idx < count ){\n" <<
             // Next element
-            k.decl<input_type>("next") << " = " << first[k.var<uint_>("idx")] << ";\n" <<
+            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
+            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+            "#endif\n" <<
+
             // Comparison between currently best element (acc) and next element
-            "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
-            "compare_result = " << compare(k.var<input_type>("acc"),
-                                           k.var<input_type>("next")) << ";\n" <<
-            "#else\n" <<
+            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
             "compare_result = " << compare(k.var<input_type>("next"),
                                            k.var<input_type>("acc")) << ";\n" <<
+            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            "equal = !compare_result && !" <<
+                compare(k.var<input_type>("acc"),
+                        k.var<input_type>("next")) << ";\n" <<
+            "# endif\n" <<
+            "#else\n" <<
+            "compare_result = " << compare(k.var<input_type>("acc"),
+                                           k.var<input_type>("next")) << ";\n" <<
+            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            "equal = !compare_result && !" <<
+                compare(k.var<input_type>("next"),
+                        k.var<input_type>("acc")) << ";\n" <<
+            "# endif\n" <<
             "#endif\n" <<
+
+            // save the winner
             "acc = compare_result ? acc : next;\n" <<
+            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            "acc_idx = compare_result ? " <<
+                "acc_idx : " <<
+                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
+            "#else\n" <<
             "acc_idx = compare_result ? acc_idx : idx;\n" <<
+            "#endif\n" <<
             "idx += get_global_size(0);\n" <<
-        "}\n" <<
+        "}\n\n" <<
 
         // Work item local id
         k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
@@ -135,7 +166,8 @@ inline size_t find_extrema_with_reduce(InputIterator first,
         "block_idx[lid] = acc_idx;\n" <<
         "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
 
-        k.decl<uint_>("group_offset") << " = count - (get_local_size(0) * get_group_id(0));\n";
+        k.decl<uint_>("group_offset") <<
+            " = count - (get_local_size(0) * get_group_id(0));\n\n";
 
     k <<
         "#pragma unroll\n"
@@ -144,35 +176,46 @@ inline size_t find_extrema_with_reduce(InputIterator first,
              "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
                  k.decl<input_type>("mine") << " = block[lid];\n" <<
                  k.decl<input_type>("other") << " = block[lid+offset];\n" <<
-                 "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
-                 "compare_result = " << compare(k.var<input_type>("mine"),
-                                                k.var<input_type>("other")) << ";\n" <<
-                 "#else\n" <<
+                 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
                  "compare_result = " << compare(k.var<input_type>("other"),
                                                 k.var<input_type>("mine")) << ";\n" <<
+                 "equal = !compare_result && !" <<
+                     compare(k.var<input_type>("mine"),
+                             k.var<input_type>("other")) << ";\n" <<
+                 "#else\n" <<
+                 "compare_result = " << compare(k.var<input_type>("mine"),
+                                                k.var<input_type>("other")) << ";\n" <<
+                 "equal = !compare_result && !" <<
+                     compare(k.var<input_type>("other"),
+                             k.var<input_type>("mine")) << ";\n" <<
                  "#endif\n" <<
                  "block[lid] = compare_result ? mine : other;\n" <<
+                 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
+                 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
                  "block_idx[lid] = compare_result ? " <<
-                     "block_idx[lid] : block_idx[lid+offset];\n" <<
+                     "mine_idx : " <<
+                     "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
              "}\n"
              "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
-        "}\n" <<
+        "}\n\n" <<
 
          // write block result to global output
         "if(lid == 0){\n" <<
-        "    output[get_group_id(0)] = block[0];\n" <<
-        "    output_idx[get_group_id(0)] = block_idx[0];\n" <<
+            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
+            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
         "}";
 
     std::string options;
     if(!find_minimum){
         options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
     }
+    if(use_input_idx){
+        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
+    }
+
     kernel kernel = k.compile(context, options);
 
     kernel.set_arg(count_arg, static_cast<uint_>(count));
-    kernel.set_arg(output_arg, result.get_buffer());
-    kernel.set_arg(output_idx_arg, result_idx.get_buffer());
     kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
     kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
 
@@ -180,104 +223,114 @@ inline size_t find_extrema_with_reduce(InputIterator first,
                                   0,
                                   work_groups_no * work_group_size,
                                   work_group_size);
+}
 
-    return 0;
+template<class InputIterator, class ResultIterator, class Compare>
+inline void find_extrema_with_reduce(InputIterator input,
+                                     size_t count,
+                                     ResultIterator result,
+                                     vector<uint_>::iterator result_idx,
+                                     size_t work_groups_no,
+                                     size_t work_group_size,
+                                     Compare compare,
+                                     const bool find_minimum,
+                                     command_queue &queue)
+{
+    // dummy will not be used
+    buffer_iterator<uint_> dummy = result_idx;
+    return find_extrema_with_reduce(
+        input, dummy, count, result, result_idx, work_groups_no,
+        work_group_size, compare, find_minimum, false, queue
+    );
 }
 
 template<class InputIterator, class Compare>
-uint_ find_extrema_final(InputIterator candidates,
-                         vector<uint_>::iterator candidates_idx,
-                         const size_t count,
-                         Compare compare,
-                         const bool find_minimum,
-                         const size_t work_group_size,
-                         command_queue &queue)
+InputIterator find_extrema_with_reduce(InputIterator first,
+                                       InputIterator last,
+                                       Compare compare,
+                                       const bool find_minimum,
+                                       command_queue &queue)
 {
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 
     const context &context = queue.get_context();
+    const device &device = queue.get_device();
 
-    // device vectors for the result
-    vector<input_type> result(1, context);
-    vector<uint_> result_idx(1, context);
+    // Getting information about used queue and device
+    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
 
-    // get extremum from among the candidates
-    find_extrema_with_reduce(
-        candidates, count, result.begin(), result_idx.begin(),
-        1, work_group_size, compare, find_minimum, queue
-    );
+    const size_t count = detail::iterator_range_size(first, last);
 
-    // get candidate index
-    const uint_ idx = (result_idx.begin()).read(queue);
-    // get extremum index
-    typename vector<uint_>::iterator extremum_idx = candidates_idx + idx;
+    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+        + type_name<input_type>();
 
-    // return extremum index
-    return extremum_idx.read(queue);
-}
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
 
-template<class InputIterator>
-uint_ find_extrema_final(InputIterator candidates,
-                         vector<uint_>::iterator candidates_idx,
-                         const size_t count,
-                         ::boost::compute::less<
-                             typename std::iterator_traits<InputIterator>::value_type
-                         > compare,
-                         const bool find_minimum,
-                         const size_t work_group_size,
-                         command_queue &queue)
-{
-    (void) work_group_size;
+    // get preferred work group size and preferred number
+    // of work groups per compute unit
+    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
 
-    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
-    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+    // calculate work group size and number of work groups
+    work_group_size = (std::min)(max_work_group_size, work_group_size);
+    size_t work_groups_no = compute_units_no * work_groups_per_cu;
+    work_groups_no = (std::min)(
+        work_groups_no,
+        static_cast<size_t>(std::ceil(float(count) / work_group_size))
+    );
 
-    // host vectors
-    std::vector<input_type> host_candidates(count);
-    std::vector<uint_> host_candidates_idx(count);
+    // phase I: finding candidates for extremum
 
-    InputIterator candidates_last =
-        candidates + static_cast<difference_type>(count);
-    vector<uint_>::iterator candidates_idx_last =
-        candidates_idx + count;
+    // device buffors for extremum candidates and their indices
+    // each work-group computes its candidate
+    vector<input_type> candidates(work_groups_no, context);
+    vector<uint_> candidates_idx(work_groups_no, context);
 
-    // copying extremum candidates found by find_extrema_reduce(...) to host
-    ::boost::compute::copy(candidates_idx, candidates_idx_last,
-                           host_candidates_idx.begin(), queue);
-    ::boost::compute::copy(candidates, candidates_last,
-                           host_candidates.begin(), queue);
+    // finding candidates for first extremum and their indices
+    find_extrema_with_reduce(
+        first, count, candidates.begin(), candidates_idx.begin(),
+        work_groups_no, work_group_size, compare, find_minimum, queue
+    );
 
-    typename std::vector<input_type>::iterator i = host_candidates.begin();
-    std::vector<uint_>::iterator idx = host_candidates_idx.begin();
-    std::vector<uint_>::iterator extremum_idx = idx;
-    input_type extremum = *i;
+    // phase II: finding extremum from among the candidates
 
-    // find extremum from among the candidates
-    if(!find_minimum) {
-        while(idx != host_candidates_idx.end()) {
-            bool compare_result =  *i > extremum;
-            extremum = compare_result ? *i : extremum;
-            extremum_idx = compare_result ? idx : extremum_idx;
-            idx++, i++;
-        }
-    }
-    else {
-        while(idx != host_candidates_idx.end()) {
-            bool compare_result =  *i < extremum;
-            extremum = compare_result ? *i : extremum;
-            extremum_idx = compare_result ? idx : extremum_idx;
-            idx++, i++;
-        }
-    }
+    // zero-copy buffers for final result (value and index)
+    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+        result(1, context);
+    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
+        result_idx(1, context);
 
-    // return extremum index
-    return (*extremum_idx);
+    // get extremum from among the candidates
+    find_extrema_with_reduce(
+        candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
+        result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
+    );
+
+    // mapping extremum index to host
+    uint_* result_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                result_idx.get_buffer(), command_queue::map_read,
+                0, sizeof(uint_)
+            )
+        );
+
+    return first + static_cast<difference_type>(*result_idx_host_ptr);
 }
 
-template<class InputIterator, class Compare>
+template<class InputIterator>
 InputIterator find_extrema_with_reduce(InputIterator first,
                                        InputIterator last,
-                                       Compare compare,
+                                       ::boost::compute::less<
+                                           typename std::iterator_traits<
+                                               InputIterator
+                                           >::value_type
+                                       >
+                                       compare,
                                        const bool find_minimum,
                                        command_queue &queue)
 {
@@ -309,26 +362,76 @@ InputIterator find_extrema_with_reduce(InputIterator first,
     work_group_size = (std::min)(max_work_group_size, work_group_size);
     size_t work_groups_no = compute_units_no * work_groups_per_cu;
     work_groups_no = (std::min)(
-            work_groups_no,
-            static_cast<size_t>(std::ceil(float(count) / work_group_size)));
+        work_groups_no,
+        static_cast<size_t>(std::ceil(float(count) / work_group_size))
+    );
 
-    // device vectors for extremum candidates and their indices
-    vector<input_type> candidates(work_groups_no, context);
-    vector<uint_> candidates_idx(work_groups_no, context);
+    // phase I: finding candidates for extremum
+
+    // device buffors for extremum candidates and their indices
+    // each work-group computes its candidate
+    // zero-copy buffers are used to eliminate copying data back to host
+    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+        candidates(work_groups_no, context);
+    vector<uint_, ::boost::compute::pinned_allocator <uint_> >
+        candidates_idx(work_groups_no, context);
 
-    // find extremum candidates and their indices
+    // finding candidates for first extremum and their indices
     find_extrema_with_reduce(
         first, count, candidates.begin(), candidates_idx.begin(),
         work_groups_no, work_group_size, compare, find_minimum, queue
-     );
-
-    // get extremum index
-    const uint_ extremum_idx = find_extrema_final(
-        candidates.begin(), candidates_idx.begin(), work_groups_no, compare,
-        find_minimum, work_group_size, queue
     );
 
-    return first + static_cast<difference_type>(extremum_idx);
+    // phase II: finding extremum from among the candidates
+
+    // mapping candidates and their indices to host
+    input_type* candidates_host_ptr =
+        static_cast<input_type*>(
+            queue.enqueue_map_buffer(
+                candidates.get_buffer(), command_queue::map_read,
+                0, work_groups_no * sizeof(input_type)
+            )
+        );
+
+    uint_* candidates_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                candidates_idx.get_buffer(), command_queue::map_read,
+                0, work_groups_no * sizeof(uint_)
+            )
+        );
+
+    input_type* i = candidates_host_ptr;
+    uint_* idx = candidates_idx_host_ptr;
+    uint_* extremum_idx = idx;
+    input_type extremum = *candidates_host_ptr;
+    i++; idx++;
+
+    // find extremum (serial) from among the candidates on host
+    if(!find_minimum) {
+        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
+            input_type next = *i;
+            bool compare_result =  next > extremum;
+            bool equal = next == extremum;
+            extremum = compare_result ? next : extremum;
+            extremum_idx = compare_result ? idx : extremum_idx;
+            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
+            idx++, i++;
+        }
+    }
+    else {
+        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
+            input_type next = *i;
+            bool compare_result = next < extremum;
+            bool equal = next == extremum;
+            extremum = compare_result ? next : extremum;
+            extremum_idx = compare_result ? idx : extremum_idx;
+            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
+            idx++, i++;
+        }
+    }
+
+    return first + static_cast<difference_type>(*extremum_idx);
 }
 
 } // end detail namespace

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/compute.git



More information about the debian-science-commits mailing list