[compute] 26/46: Fix find_extrema_with_reduce
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Dec 21 18:28:42 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository compute.
commit 2d972fe2a4b7318e40db9451ea93cfa15808476b
Author: Jakub Szuppe <j.szuppe at gmail.com>
Date: Thu Sep 24 20:29:25 2015 +0200
Fix find_extrema_with_reduce
Now find_extrema_with_reduce always return the first extremum
in the given range.
---
.../algorithm/detail/find_extrema_with_reduce.hpp | 345 +++++++++++++--------
1 file changed, 224 insertions(+), 121 deletions(-)
diff --git a/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
index 55fb688..a157e24 100644
--- a/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
+++ b/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
@@ -12,11 +12,11 @@
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
#include <algorithm>
-#include <vector>
#include <boost/compute/types.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
@@ -24,7 +24,6 @@
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>
-#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
namespace boost {
namespace compute {
@@ -73,16 +72,24 @@ bool find_extrema_with_reduce_requirements_met(InputIterator first,
return ((required_local_mem_size * 4) <= local_mem_size);
}
+/// \internal_
+/// Algorithm finds the first extremum in given range, i.e., with the lowest
+/// index.
+///
+/// If \p use_input_idx is false, it's assumed that input data is ordered by
+/// increasing index and \p input_idx is not used in the algorithm.
template<class InputIterator, class ResultIterator, class Compare>
-inline size_t find_extrema_with_reduce(InputIterator first,
- size_t count,
- ResultIterator result,
- vector<uint_>::iterator result_idx,
- size_t work_groups_no,
- size_t work_group_size,
- Compare compare,
- const bool find_minimum,
- command_queue &queue)
+inline void find_extrema_with_reduce(InputIterator input,
+ vector<uint_>::iterator input_idx,
+ size_t count,
+ ResultIterator result,
+ vector<uint_>::iterator result_idx,
+ size_t work_groups_no,
+ size_t work_group_size,
+ Compare compare,
+ const bool find_minimum,
+ const bool use_input_idx,
+ command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
@@ -90,44 +97,68 @@ inline size_t find_extrema_with_reduce(InputIterator first,
meta_kernel k("find_extrema_reduce");
size_t count_arg = k.add_arg<uint_>("count");
- size_t output_arg = k.add_arg<input_type *>(memory_object::global_memory, "output");
- size_t output_idx_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");
size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
k <<
// Work item global id
k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
- //
- "if(gid >= count) {\n return;\n }\n" <<
+ "if(gid >= count) {\n return;\n}\n" <<
// Index of element that will be read from input buffer
k.decl<uint_>("idx") << " = gid;\n" <<
k.decl<input_type>("acc") << ";\n" <<
- // Index of currently best element
+ // Real index of currently best element
+ "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+ k.decl<input_type>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+ "#else\n" <<
k.decl<uint_>("acc_idx") << " = idx;\n" <<
+ "#endif\n" <<
// Init accumulator with first[get_global_id(0)]
- "acc = " << first[k.var<uint_>("idx")] << ";\n" <<
+ "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
"idx += get_global_size(0);\n" <<
k.decl<bool>("compare_result") << ";\n" <<
+ k.decl<bool>("equal") << ";\n\n" <<
"while( idx < count ){\n" <<
// Next element
- k.decl<input_type>("next") << " = " << first[k.var<uint_>("idx")] << ";\n" <<
+ k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
+ "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+ k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+ "#endif\n" <<
+
// Comparison between currently best element (acc) and next element
- "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
- "compare_result = " << compare(k.var<input_type>("acc"),
- k.var<input_type>("next")) << ";\n" <<
- "#else\n" <<
+ "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
"compare_result = " << compare(k.var<input_type>("next"),
k.var<input_type>("acc")) << ";\n" <<
+ "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+ "equal = !compare_result && !" <<
+ compare(k.var<input_type>("acc"),
+ k.var<input_type>("next")) << ";\n" <<
+ "# endif\n" <<
+ "#else\n" <<
+ "compare_result = " << compare(k.var<input_type>("acc"),
+ k.var<input_type>("next")) << ";\n" <<
+ "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+ "equal = !compare_result && !" <<
+ compare(k.var<input_type>("next"),
+ k.var<input_type>("acc")) << ";\n" <<
+ "# endif\n" <<
"#endif\n" <<
+
+ // save the winner
"acc = compare_result ? acc : next;\n" <<
+ "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+ "acc_idx = compare_result ? " <<
+ "acc_idx : " <<
+ "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
+ "#else\n" <<
"acc_idx = compare_result ? acc_idx : idx;\n" <<
+ "#endif\n" <<
"idx += get_global_size(0);\n" <<
- "}\n" <<
+ "}\n\n" <<
// Work item local id
k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
@@ -135,7 +166,8 @@ inline size_t find_extrema_with_reduce(InputIterator first,
"block_idx[lid] = acc_idx;\n" <<
"barrier(CLK_LOCAL_MEM_FENCE);\n" <<
- k.decl<uint_>("group_offset") << " = count - (get_local_size(0) * get_group_id(0));\n";
+ k.decl<uint_>("group_offset") <<
+ " = count - (get_local_size(0) * get_group_id(0));\n\n";
k <<
"#pragma unroll\n"
@@ -144,35 +176,46 @@ inline size_t find_extrema_with_reduce(InputIterator first,
"if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
k.decl<input_type>("mine") << " = block[lid];\n" <<
k.decl<input_type>("other") << " = block[lid+offset];\n" <<
- "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
- "compare_result = " << compare(k.var<input_type>("mine"),
- k.var<input_type>("other")) << ";\n" <<
- "#else\n" <<
+ "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
"compare_result = " << compare(k.var<input_type>("other"),
k.var<input_type>("mine")) << ";\n" <<
+ "equal = !compare_result && !" <<
+ compare(k.var<input_type>("mine"),
+ k.var<input_type>("other")) << ";\n" <<
+ "#else\n" <<
+ "compare_result = " << compare(k.var<input_type>("mine"),
+ k.var<input_type>("other")) << ";\n" <<
+ "equal = !compare_result && !" <<
+ compare(k.var<input_type>("other"),
+ k.var<input_type>("mine")) << ";\n" <<
"#endif\n" <<
"block[lid] = compare_result ? mine : other;\n" <<
+ k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
+ k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
"block_idx[lid] = compare_result ? " <<
- "block_idx[lid] : block_idx[lid+offset];\n" <<
+ "mine_idx : " <<
+ "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n" <<
- "}\n" <<
+ "}\n\n" <<
// write block result to global output
"if(lid == 0){\n" <<
- " output[get_group_id(0)] = block[0];\n" <<
- " output_idx[get_group_id(0)] = block_idx[0];\n" <<
+ result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
+ result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
"}";
std::string options;
if(!find_minimum){
options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
}
+ if(use_input_idx){
+ options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
+ }
+
kernel kernel = k.compile(context, options);
kernel.set_arg(count_arg, static_cast<uint_>(count));
- kernel.set_arg(output_arg, result.get_buffer());
- kernel.set_arg(output_idx_arg, result_idx.get_buffer());
kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
@@ -180,104 +223,114 @@ inline size_t find_extrema_with_reduce(InputIterator first,
0,
work_groups_no * work_group_size,
work_group_size);
+}
- return 0;
+template<class InputIterator, class ResultIterator, class Compare>
+inline void find_extrema_with_reduce(InputIterator input,
+ size_t count,
+ ResultIterator result,
+ vector<uint_>::iterator result_idx,
+ size_t work_groups_no,
+ size_t work_group_size,
+ Compare compare,
+ const bool find_minimum,
+ command_queue &queue)
+{
+ // dummy will not be used
+ buffer_iterator<uint_> dummy = result_idx;
+ return find_extrema_with_reduce(
+ input, dummy, count, result, result_idx, work_groups_no,
+ work_group_size, compare, find_minimum, false, queue
+ );
}
template<class InputIterator, class Compare>
-uint_ find_extrema_final(InputIterator candidates,
- vector<uint_>::iterator candidates_idx,
- const size_t count,
- Compare compare,
- const bool find_minimum,
- const size_t work_group_size,
- command_queue &queue)
+InputIterator find_extrema_with_reduce(InputIterator first,
+ InputIterator last,
+ Compare compare,
+ const bool find_minimum,
+ command_queue &queue)
{
+ typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const context &context = queue.get_context();
+ const device &device = queue.get_device();
- // device vectors for the result
- vector<input_type> result(1, context);
- vector<uint_> result_idx(1, context);
+ // Getting information about used queue and device
+ const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+ const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
- // get extremum from among the candidates
- find_extrema_with_reduce(
- candidates, count, result.begin(), result_idx.begin(),
- 1, work_group_size, compare, find_minimum, queue
- );
+ const size_t count = detail::iterator_range_size(first, last);
- // get candidate index
- const uint_ idx = (result_idx.begin()).read(queue);
- // get extremum index
- typename vector<uint_>::iterator extremum_idx = candidates_idx + idx;
+ std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+ + type_name<input_type>();
- // return extremum index
- return extremum_idx.read(queue);
-}
+ // load parameters
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
-template<class InputIterator>
-uint_ find_extrema_final(InputIterator candidates,
- vector<uint_>::iterator candidates_idx,
- const size_t count,
- ::boost::compute::less<
- typename std::iterator_traits<InputIterator>::value_type
- > compare,
- const bool find_minimum,
- const size_t work_group_size,
- command_queue &queue)
-{
- (void) work_group_size;
+ // get preferred work group size and preferred number
+ // of work groups per compute unit
+ size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+ size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
- typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
- typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+ // calculate work group size and number of work groups
+ work_group_size = (std::min)(max_work_group_size, work_group_size);
+ size_t work_groups_no = compute_units_no * work_groups_per_cu;
+ work_groups_no = (std::min)(
+ work_groups_no,
+ static_cast<size_t>(std::ceil(float(count) / work_group_size))
+ );
- // host vectors
- std::vector<input_type> host_candidates(count);
- std::vector<uint_> host_candidates_idx(count);
+ // phase I: finding candidates for extremum
- InputIterator candidates_last =
- candidates + static_cast<difference_type>(count);
- vector<uint_>::iterator candidates_idx_last =
- candidates_idx + count;
+ // device buffors for extremum candidates and their indices
+ // each work-group computes its candidate
+ vector<input_type> candidates(work_groups_no, context);
+ vector<uint_> candidates_idx(work_groups_no, context);
- // copying extremum candidates found by find_extrema_reduce(...) to host
- ::boost::compute::copy(candidates_idx, candidates_idx_last,
- host_candidates_idx.begin(), queue);
- ::boost::compute::copy(candidates, candidates_last,
- host_candidates.begin(), queue);
+ // finding candidates for first extremum and their indices
+ find_extrema_with_reduce(
+ first, count, candidates.begin(), candidates_idx.begin(),
+ work_groups_no, work_group_size, compare, find_minimum, queue
+ );
- typename std::vector<input_type>::iterator i = host_candidates.begin();
- std::vector<uint_>::iterator idx = host_candidates_idx.begin();
- std::vector<uint_>::iterator extremum_idx = idx;
- input_type extremum = *i;
+ // phase II: finding extremum from among the candidates
- // find extremum from among the candidates
- if(!find_minimum) {
- while(idx != host_candidates_idx.end()) {
- bool compare_result = *i > extremum;
- extremum = compare_result ? *i : extremum;
- extremum_idx = compare_result ? idx : extremum_idx;
- idx++, i++;
- }
- }
- else {
- while(idx != host_candidates_idx.end()) {
- bool compare_result = *i < extremum;
- extremum = compare_result ? *i : extremum;
- extremum_idx = compare_result ? idx : extremum_idx;
- idx++, i++;
- }
- }
+ // zero-copy buffers for final result (value and index)
+ vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+ result(1, context);
+ vector<uint_, ::boost::compute::pinned_allocator<uint_> >
+ result_idx(1, context);
- // return extremum index
- return (*extremum_idx);
+ // get extremum from among the candidates
+ find_extrema_with_reduce(
+ candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
+ result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
+ );
+
+ // mapping extremum index to host
+ uint_* result_idx_host_ptr =
+ static_cast<uint_*>(
+ queue.enqueue_map_buffer(
+ result_idx.get_buffer(), command_queue::map_read,
+ 0, sizeof(uint_)
+ )
+ );
+
+ return first + static_cast<difference_type>(*result_idx_host_ptr);
}
-template<class InputIterator, class Compare>
+template<class InputIterator>
InputIterator find_extrema_with_reduce(InputIterator first,
InputIterator last,
- Compare compare,
+ ::boost::compute::less<
+ typename std::iterator_traits<
+ InputIterator
+ >::value_type
+ >
+ compare,
const bool find_minimum,
command_queue &queue)
{
@@ -309,26 +362,76 @@ InputIterator find_extrema_with_reduce(InputIterator first,
work_group_size = (std::min)(max_work_group_size, work_group_size);
size_t work_groups_no = compute_units_no * work_groups_per_cu;
work_groups_no = (std::min)(
- work_groups_no,
- static_cast<size_t>(std::ceil(float(count) / work_group_size)));
+ work_groups_no,
+ static_cast<size_t>(std::ceil(float(count) / work_group_size))
+ );
- // device vectors for extremum candidates and their indices
- vector<input_type> candidates(work_groups_no, context);
- vector<uint_> candidates_idx(work_groups_no, context);
+ // phase I: finding candidates for extremum
+
+ // device buffors for extremum candidates and their indices
+ // each work-group computes its candidate
+ // zero-copy buffers are used to eliminate copying data back to host
+ vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+ candidates(work_groups_no, context);
+ vector<uint_, ::boost::compute::pinned_allocator <uint_> >
+ candidates_idx(work_groups_no, context);
- // find extremum candidates and their indices
+ // finding candidates for first extremum and their indices
find_extrema_with_reduce(
first, count, candidates.begin(), candidates_idx.begin(),
work_groups_no, work_group_size, compare, find_minimum, queue
- );
-
- // get extremum index
- const uint_ extremum_idx = find_extrema_final(
- candidates.begin(), candidates_idx.begin(), work_groups_no, compare,
- find_minimum, work_group_size, queue
);
- return first + static_cast<difference_type>(extremum_idx);
+ // phase II: finding extremum from among the candidates
+
+ // mapping candidates and their indices to host
+ input_type* candidates_host_ptr =
+ static_cast<input_type*>(
+ queue.enqueue_map_buffer(
+ candidates.get_buffer(), command_queue::map_read,
+ 0, work_groups_no * sizeof(input_type)
+ )
+ );
+
+ uint_* candidates_idx_host_ptr =
+ static_cast<uint_*>(
+ queue.enqueue_map_buffer(
+ candidates_idx.get_buffer(), command_queue::map_read,
+ 0, work_groups_no * sizeof(uint_)
+ )
+ );
+
+ input_type* i = candidates_host_ptr;
+ uint_* idx = candidates_idx_host_ptr;
+ uint_* extremum_idx = idx;
+ input_type extremum = *candidates_host_ptr;
+ i++; idx++;
+
+ // find extremum (serial) from among the candidates on host
+ if(!find_minimum) {
+ while(idx != (candidates_idx_host_ptr + work_groups_no)) {
+ input_type next = *i;
+ bool compare_result = next > extremum;
+ bool equal = next == extremum;
+ extremum = compare_result ? next : extremum;
+ extremum_idx = compare_result ? idx : extremum_idx;
+ extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
+ idx++, i++;
+ }
+ }
+ else {
+ while(idx != (candidates_idx_host_ptr + work_groups_no)) {
+ input_type next = *i;
+ bool compare_result = next < extremum;
+ bool equal = next == extremum;
+ extremum = compare_result ? next : extremum;
+ extremum_idx = compare_result ? idx : extremum_idx;
+ extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
+ idx++, i++;
+ }
+ }
+
+ return first + static_cast<difference_type>(*extremum_idx);
}
} // end detail namespace
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/compute.git
More information about the debian-science-commits
mailing list