[python-dtcwt] 164/497: opencl: remove requirement that kernel vectors be on host

Tue Jul 21 18:06:00 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository python-dtcwt.

commit 19d78e5846c096080be9ab6b719fb5fd1685be5a
Author: Rich Wareham <rjw57 at cam.ac.uk>
Date:   Fri Nov 8 15:22:51 2013 +0000

    opencl: remove requirement that kernel vectors be on host
    
    The flip_output flag was computed on the host, which required copying all
    of the filter kernel. Compute this flag on the device.
---
 dtcwt/opencl/lowlevel.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/dtcwt/opencl/lowlevel.py b/dtcwt/opencl/lowlevel.py
index 808087b..64544f0 100644
--- a/dtcwt/opencl/lowlevel.py
+++ b/dtcwt/opencl/lowlevel.py
@@ -275,9 +275,7 @@ def axis_convolve_dfilter(X, h, axis=0, queue=None, output=None):
         output_shape[axis] >>= 1
         output = cl_array.zeros(queue, output_shape, np.float32)
 
-    flip_output = np.dot(h.flat, h.flat[::-1]) > 0
-
-    return _apply_kernel(X, h, kern, output, axis=axis, elementstep=2, extra_kernel_args=[np.int32(flip_output),])
+    return _apply_kernel(X, h, kern, output, axis=axis, elementstep=2)
 
 def axis_convolve_ifilter(X, h, axis=0, queue=None, output=None):
     _check_cl()
@@ -290,9 +288,7 @@ def axis_convolve_ifilter(X, h, axis=0, queue=None, output=None):
         output_shape[axis] <<= 1
         output = cl_array.zeros(queue, output_shape, np.float32)
 
-    flip_output = np.dot(h.flat, h.flat[::-1]) > 0
-
-    return _apply_kernel(X, h, kern, output, axis=axis, elementstep=0.5, extra_kernel_args=[np.int32(flip_output),])
+    return _apply_kernel(X, h, kern, output, axis=axis, elementstep=0.5)
 
 @memoize
 def _convolve_kernel_for_queue(context):
@@ -390,7 +386,7 @@ void __kernel convolve_kernel(
     const __global float* X, int4 X_strides, int4 X_shape, int X_offset,
     const __global float* h, int h_stride, int h_shape, int h_offset,
     __global float* Y, int4 Y_strides, int4 Y_shape, int Y_offset,
-    int axis, int flip_output)
+    int axis)
 {
     int4 global_coord = { get_global_id(0), get_global_id(1), get_global_id(2), 0 };
     struct array_spec X_spec = { .strides = X_strides, .shape = X_shape, .offset = X_offset };
@@ -419,6 +415,12 @@ void __kernel convolve_kernel(
     float2 output = { 0, 0 };
     int4 offsets = { 1, 0, 3, 2 };
 
+    float ha_dot_hb = 0.f;
+    for(int i=0; i<h_shape; ++i) {
+        ha_dot_hb += h[h_offset + i*h_stride] * h[h_offset + (h_shape - 1 - i)*h_stride];
+    }
+    bool flip_output = ha_dot_hb > 0.f;
+
     int m = h_shape>>1;
     for(int d=0; d<m; ++d) {
         int X_offset = 4*((m>>1)-d);
@@ -456,7 +458,7 @@ void __kernel convolve_kernel(
     const __global float* X, int4 X_strides, int4 X_shape, int X_offset,
     const __global float* h, int h_stride, int h_shape, int h_offset,
     __global float* Y, int4 Y_strides, int4 Y_shape, int Y_offset,
-    int axis, int flip_output)
+    int axis)
 {
     int4 global_coord = { get_global_id(0), get_global_id(1), get_global_id(2), 0 };
     struct array_spec X_spec = { .strides = X_strides, .shape = X_shape, .offset = X_offset };
@@ -480,6 +482,12 @@ void __kernel convolve_kernel(
 
     float4 output = { 0, 0, 0, 0 };
 
+    float ha_dot_hb = 0.f;
+    for(int i=0; i<h_shape; ++i) {
+        ha_dot_hb += h[h_offset + i*h_stride] * h[h_offset + (h_shape - 1 - i)*h_stride];
+    }
+    bool flip_output = ha_dot_hb > 0.f;
+
     int m = h_shape>>1;
     int4 offsets = (m % 2 == 0) ? (int4)(-1,-2,1,0) : (int4)(1,0,1,0);
     for(int d=0; d<m; ++d) {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-dtcwt.git