[compute] 26/49: Checking multiple values per thread in find_if_with_atomics()
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Fri Dec 18 17:58:18 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository compute.
commit 75a42646a1690f9a12d2d1c27c2a4e165fa67cd9
Author: Jakub Szuppe <j.szuppe at gmail.com>
Date: Sat Aug 8 20:52:52 2015 +0200
Checking multiple values per thread in find_if_with_atomics()
---
.../algorithm/detail/find_if_with_atomics.hpp | 158 +++++++++++++++++++--
1 file changed, 148 insertions(+), 10 deletions(-)
diff --git a/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp b/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp
index d79ef5b..8f5948e 100644
--- a/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp
+++ b/include/boost/compute/algorithm/detail/find_if_with_atomics.hpp
@@ -21,25 +21,22 @@
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class UnaryPredicate>
-inline InputIterator find_if_with_atomics(InputIterator first,
- InputIterator last,
- UnaryPredicate predicate,
- command_queue &queue)
+inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
+ InputIterator last,
+ UnaryPredicate predicate,
+ const size_t count,
+ command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
- size_t count = detail::iterator_range_size(first, last);
- if(count == 0){
- return last;
- }
-
const context &context = queue.get_context();
detail::meta_kernel k("find_if");
@@ -60,13 +57,154 @@ inline InputIterator find_if_with_atomics(InputIterator first,
// initialize index to the last iterator's index
index.write(static_cast<uint_>(count), queue);
-
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
// read index and return iterator
return first + static_cast<difference_type>(index.read(queue));
}
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
+ InputIterator last,
+ UnaryPredicate predicate,
+ const size_t count,
+ const uint_ vpt,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+ typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+ const context &context = queue.get_context();
+ const device &device = queue.get_device();
+
+ detail::meta_kernel k("find_if");
+ size_t index_arg = k.add_arg<uint *>(memory_object::global_memory, "index");
+ size_t count_arg = k.add_arg<const uint>("count");
+ size_t vpt_arg = k.add_arg<const uint>("vpt");
+ atomic_min<uint_> atomic_min_uint;
+
+ // for GPUs reads from global memory are coalesced
+ if(device.type() & device::gpu) {
+ k <<
+ k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
+ k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
+ k.decl<const uint_>("end") << " = min(" <<
+ "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
+ "count" <<
+ ");\n" <<
+
+ // checking if the index is already found
+ "__local uint local_index;\n" <<
+ "if(get_local_id(0) == 0){\n" <<
+ " local_index = *index;\n " <<
+ "};\n" <<
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ "if(local_index < id){\n" <<
+ " return;\n" <<
+ "}\n" <<
+
+ "while(id < end){\n" <<
+ " " << k.decl<const value_type>("value") << " = " <<
+ first[k.var<const uint_>("id")] << ";\n"
+ " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+ " " << atomic_min_uint(k.var<uint_ *>("index"),
+ k.var<uint_>("id")) << ";\n" <<
+ " return;\n"
+ " }\n" <<
+ " id+=lsize;\n" <<
+ "}\n";
+ // for CPUs (and other devices) reads are ordered so the big cache is
+ // efficiently used.
+ } else {
+ k <<
+ k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
+ k.decl<const uint_>("end") << " = min(" <<
+ "id + " << k.var<uint_>("vpt") << "," <<
+ "count" <<
+ ");\n" <<
+ "while(id < end && (*index) > id){\n" <<
+ " " << k.decl<const value_type>("value") << " = " <<
+ first[k.var<const uint_>("id")] << ";\n"
+ " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+ " " << atomic_min_uint(k.var<uint_ *>("index"),
+ k.var<uint_>("id")) << ";\n" <<
+ " return;\n" <<
+ " }\n" <<
+ " id++;\n" <<
+ "}\n";
+ }
+
+ kernel kernel = k.compile(context);
+
+ scalar<uint_> index(context);
+ kernel.set_arg(index_arg, index.get_buffer());
+ kernel.set_arg(count_arg, static_cast<uint_>(count));
+ kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));
+
+ // initialize index to the last iterator's index
+ index.write(static_cast<uint_>(count), queue);
+
+ const size_t global_wg_size = static_cast<size_t>(
+ std::ceil(float(count) / vpt)
+ );
+ queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);
+
+ // read index and return iterator
+ return first + static_cast<difference_type>(index.read(queue));
+}
+
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics(InputIterator first,
+ InputIterator last,
+ UnaryPredicate predicate,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ return last;
+ }
+
+ const device &device = queue.get_device();
+
+ // load cached parameters
+ std::string cache_key = std::string("__boost_find_if_with_atomics_")
+ + type_name<value_type>();
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ // for relatively small inputs on GPUs kernel checking one value per thread
+ // (work-item) is more efficient than its multiple values per thread version
+ if(device.type() & device::gpu){
+ const size_t one_vpt_threshold =
+ parameters->get(cache_key, "one_vpt_threshold", 1048576);
+ if(count <= one_vpt_threshold){
+ return find_if_with_atomics_one_vpt(
+ first, last, predicate, count, queue
+ );
+ }
+ }
+
+ // values per thread
+ size_t vpt;
+ if(device.type() & device::gpu){
+ // get vpt parameter
+ vpt = parameters->get(cache_key, "vpt", 32);
+ } else {
+ // for CPUs work is split equally between compute units
+ const size_t max_compute_units =
+ device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+ vpt = static_cast<size_t>(
+ std::ceil(float(count) / max_compute_units)
+ );
+ }
+
+ return find_if_with_atomics_multiple_vpt(
+ first, last, predicate, count, vpt, queue
+ );
+}
+
} // end detail namespace
} // end compute namespace
} // end boost namespace
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/compute.git
More information about the debian-science-commits
mailing list