[compute] 26/49: Checking multiple values per thread in find_if_with_atomics()

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri Dec 18 17:58:18 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository compute.

commit 75a42646a1690f9a12d2d1c27c2a4e165fa67cd9
Author: Jakub Szuppe <j.szuppe at gmail.com>
Date:   Sat Aug 8 20:52:52 2015 +0200

    Checking multiple values per thread in find_if_with_atomics()
---
 .../algorithm/detail/find_if_with_atomics.hpp      | 158 +++++++++++++++++++--
 1 file changed, 148 insertions(+), 10 deletions(-)
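
For reference, the routine changed below is an internal detail that is reached
through the public boost::compute::find_if() algorithm. A minimal usage sketch
(the device setup, container size and predicate are illustrative, not part of
this commit):

    #include <cstdio>
    #include <vector>

    #include <boost/compute/core.hpp>
    #include <boost/compute/lambda.hpp>
    #include <boost/compute/algorithm/copy.hpp>
    #include <boost/compute/algorithm/find_if.hpp>
    #include <boost/compute/container/vector.hpp>

    namespace compute = boost::compute;

    int main()
    {
        using compute::lambda::_1;

        // default device, context and queue
        compute::device device = compute::system::default_device();
        compute::context context(device);
        compute::command_queue queue(context, device);

        // some host data with a single negative value in it
        std::vector<int> host(1 << 22, 1);
        host[1234567] = -5;

        // copy it to the device
        compute::vector<int> data(host.size(), context);
        compute::copy(host.begin(), host.end(), data.begin(), queue);

        // find the first negative value; this call ends up in
        // detail::find_if_with_atomics(), which after this commit chooses
        // between the one-value-per-thread and multiple-values-per-thread
        // kernels
        compute::vector<int>::iterator it =
            compute::find_if(data.begin(), data.end(), _1 < 0, queue);

        std::printf("found at %ld\n", static_cast<long>(it - data.begin()));
        return 0;
    }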

diff --git a/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp b/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp
index d79ef5b..8f5948e 100644
--- a/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp
+++ b/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp
@@ -21,25 +21,22 @@
 #include <boost/compute/type_traits/type_name.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
 
 namespace boost {
 namespace compute {
 namespace detail {
 
 template<class InputIterator, class UnaryPredicate>
-inline InputIterator find_if_with_atomics(InputIterator first,
-                                          InputIterator last,
-                                          UnaryPredicate predicate,
-                                          command_queue &queue)
+inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
+                                                  InputIterator last,
+                                                  UnaryPredicate predicate,
+                                                  const size_t count,
+                                                  command_queue &queue)
 {
     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
 
-    size_t count = detail::iterator_range_size(first, last);
-    if(count == 0){
-        return last;
-    }
-
     const context &context = queue.get_context();
 
     detail::meta_kernel k("find_if");
@@ -60,13 +57,154 @@ inline InputIterator find_if_with_atomics(InputIterator first,
 
     // initialize index to the last iterator's index
     index.write(static_cast<uint_>(count), queue);
-
     queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
 
     // read index and return iterator
     return first + static_cast<difference_type>(index.read(queue));
 }
 
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
+                                                       InputIterator last,
+                                                       UnaryPredicate predicate,
+                                                       const size_t count,
+                                                       const uint_ vpt,
+                                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    detail::meta_kernel k("find_if");
+    size_t index_arg = k.add_arg<uint *>(memory_object::global_memory, "index");
+    size_t count_arg = k.add_arg<const uint>("count");
+    size_t vpt_arg = k.add_arg<const uint>("vpt");
+    atomic_min<uint_> atomic_min_uint;
+
+    // for GPUs, striding by the work-group size keeps reads from global memory coalesced
+    if(device.type() & device::gpu) {
+        k <<
+            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
+            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
+            k.decl<const uint_>("end") << " = min(" <<
+                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
+                    "count" <<
+            ");\n" <<
+
+            // checking if the index is already found
+            "__local uint local_index;\n" <<
+            "if(get_local_id(0) == 0){\n" <<
+            "    local_index = *index;\n " <<
+            "};\n" <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "if(local_index < id){\n" <<
+            "    return;\n" <<
+            "}\n" <<
+
+            "while(id < end){\n" <<
+            "    " << k.decl<const value_type>("value") << " = " <<
+                      first[k.var<const uint_>("id")] << ";\n"
+            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+            "        " << atomic_min_uint(k.var<uint_ *>("index"),
+                                          k.var<uint_>("id")) << ";\n" <<
+            "        return;\n"
+            "    }\n" <<
+            "    id+=lsize;\n" <<
+            "}\n";
+    // for CPUs (and other devices) each thread reads a contiguous chunk of
+    // values, which makes efficient use of the large cache.
+    } else {
+        k <<
+            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
+            k.decl<const uint_>("end") << " = min(" <<
+                    "id + " << k.var<uint_>("vpt") << "," <<
+                    "count" <<
+            ");\n" <<
+            "while(id < end && (*index) > id){\n" <<
+            "    " << k.decl<const value_type>("value") << " = " <<
+                      first[k.var<const uint_>("id")] << ";\n"
+            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+            "        " << atomic_min_uint(k.var<uint_ *>("index"),
+                                          k.var<uint_>("id")) << ";\n" <<
+            "        return;\n" <<
+            "    }\n" <<
+            "    id++;\n" <<
+            "}\n";
+    }
+
+    kernel kernel = k.compile(context);
+
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg, index.get_buffer());
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));
+
+    // initialize index to the last iterator's index
+    index.write(static_cast<uint_>(count), queue);
+
+    const size_t global_wg_size = static_cast<size_t>(
+        std::ceil(float(count) / vpt)
+    );
+    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics(InputIterator first,
+                                          InputIterator last,
+                                          UnaryPredicate predicate,
+                                          command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return last;
+    }
+
+    const device &device = queue.get_device();
+
+    // load cached parameters
+    std::string cache_key = std::string("__boost_find_if_with_atomics_")
+        + type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // for relatively small inputs on GPUs a kernel checking one value per thread
+    // (work-item) is more efficient than the multiple-values-per-thread version
+    if(device.type() & device::gpu){
+        const size_t one_vpt_threshold =
+            parameters->get(cache_key, "one_vpt_threshold", 1048576);
+        if(count <= one_vpt_threshold){
+            return find_if_with_atomics_one_vpt(
+                first, last, predicate, count, queue
+            );
+        }
+    }
+
+    // values per thread
+    size_t vpt;
+    if(device.type() & device::gpu){
+        // get vpt parameter
+        vpt = parameters->get(cache_key, "vpt", 32);
+    } else {
+        // for CPUs work is split equally between compute units
+        const size_t max_compute_units =
+            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+        vpt = static_cast<size_t>(
+            std::ceil(float(count) / max_compute_units)
+        );
+    }
+
+    return find_if_with_atomics_multiple_vpt(
+        first, last, predicate, count, vpt, queue
+    );
+}
+
 } // end detail namespace
 } // end compute namespace
 } // end boost namespace
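
The two branches above partition the work differently: on GPUs each work-item
strides through its block by the work-group size, so that neighbouring
work-items read neighbouring elements (coalesced access), while on CPUs each
work-item scans a contiguous chunk of vpt elements. A small host-side sketch
of the resulting index pattern (count, vpt and lsize are toy values here, not
the tuned defaults):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        // toy configuration: 24 elements, 4 values per thread,
        // work-groups of 3 threads (illustrative values only)
        const unsigned count = 24, vpt = 4, lsize = 3;
        const unsigned global_size = (count + vpt - 1) / vpt; // ceil(count / vpt)

        // GPU branch: stride by the work-group size -> neighbouring
        // work-items touch neighbouring elements on every pass
        for(unsigned gid = 0; gid < global_size; gid++){
            const unsigned lid = gid % lsize, group = gid / lsize;
            unsigned id = lid + group * lsize * vpt;
            const unsigned end = std::min(id + lsize * vpt, count);
            std::printf("gpu work-item %u:", gid);
            for(; id < end; id += lsize) std::printf(" %u", id);
            std::printf("\n");
        }

        // CPU branch: each work-item scans a contiguous chunk of vpt elements
        for(unsigned gid = 0; gid < global_size; gid++){
            unsigned id = gid * vpt;
            const unsigned end = std::min(id + vpt, count);
            std::printf("cpu work-item %u:", gid);
            for(; id < end; id++) std::printf(" %u", id);
            std::printf("\n");
        }
        return 0;
    }

With these toy values the GPU branch has work-item 0 reading 0, 3, 6, 9 and
work-item 1 reading 1, 4, 7, 10, so each pass touches consecutive addresses,
while the CPU branch has work-item 0 reading 0..3 and work-item 1 reading
4..7. On the CPU path the dispatcher picks vpt = ceil(count /
CL_DEVICE_MAX_COMPUTE_UNITS); for example 1,000,000 elements on an
8-compute-unit CPU gives vpt = 125000 and therefore 8 work-items.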

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/compute.git


