[arrayfire] 36/61: Fixed and improved CUDA's homography

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Dec 8 11:55:07 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.

commit 9b0051147a2802866e8dcc07da83ce2d36633c8d
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Tue Dec 1 16:31:08 2015 -0500

    Fixed and improved CUDA's homography
    
    * Added missing __syncthreads()
    * Removed the need for some global memory arrays (see the sketch below)
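
    A minimal, self-contained sketch of the shared-memory pattern this change
    moves to; the names scratch, sumRow and fillAndReduce are illustrative only
    and do not appear in the ArrayFire sources. One dynamically sized
    extern __shared__ buffer is declared at file scope, the __global__ kernel
    and a __device__ helper both alias it, and a __syncthreads() barrier
    separates the threads that write the buffer from the threads that read it,
    so no staging through global memory arrays is needed:

        #include <cstdio>

        // Single dynamic shared-memory allocation, aliased by the kernel and
        // by the device helper (mirrors the role of `sh` in homography.hpp).
        extern __shared__ char scratch[];

        template<typename T>
        __device__ void sumRow(int n)
        {
            T* s = reinterpret_cast<T*>(scratch);
            // One thread consumes values produced by all threads of the block.
            if (threadIdx.x == 0) {
                T acc = T(0);
                for (int i = 0; i < n; ++i)
                    acc += s[i];
                s[0] = acc;
            }
        }

        template<typename T>
        __global__ void fillAndReduce(T* out, int n)
        {
            T* s = reinterpret_cast<T*>(scratch);
            if (threadIdx.x < n)
                s[threadIdx.x] = static_cast<T>(threadIdx.x);

            // Barrier before the helper reads values written by other threads;
            // omitting it is the class of bug the first bullet refers to.
            __syncthreads();

            sumRow<T>(n);

            __syncthreads();
            if (threadIdx.x == 0)
                out[blockIdx.x] = s[0];
        }

        int main()
        {
            const int n = 64;
            float *d_out = nullptr, h_out = 0.0f;
            cudaMalloc(&d_out, sizeof(float));
            // The third launch parameter sizes the extern __shared__ buffer,
            // analogous to ls_shared_sz in the CUDA_LAUNCH_SMEM call below.
            fillAndReduce<float><<<1, n, n * sizeof(float)>>>(d_out, n);
            cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
            printf("%g\n", h_out);   // expect 0 + 1 + ... + 63 = 2016
            cudaFree(d_out);
            return 0;
        }

    Assuming a block shape of dim3(16, 16), as the 16*81 scratch arrays in
    JacobiSVD suggest, the ls_shared_sz computed in computeH works out to
    16 * 81 * 2 * sizeof(T) bytes per block: one 16x81 tile for s_V and one
    for s_S, the storage that previously lived in the global tmpA and tmpV
    arrays.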
---
 src/backend/cuda/homography.cu         |  4 +-
 src/backend/cuda/kernel/homography.hpp | 91 +++++++++++++++++++---------------
 2 files changed, 51 insertions(+), 44 deletions(-)

diff --git a/src/backend/cuda/homography.cu b/src/backend/cuda/homography.cu
index a7a993a..e522e81 100644
--- a/src/backend/cuda/homography.cu
+++ b/src/backend/cuda/homography.cu
@@ -56,12 +56,10 @@ int homography(Array<T> &bestH,
     Array<float> rnd = arithOp<float, af_mul_t>(frnd, fctr, rdims);
 
     Array<T> tmpH = createValueArray<T>(af::dim4(9, iter), (T)0);
-    Array<T> tmpA = createValueArray<T>(af::dim4(9, 9, iter), (T)0);
-    Array<T> tmpV = createValueArray<T>(af::dim4(9, 9, iter), (T)0);
 
     bestH = createValueArray<T>(af::dim4(3, 3), (T)0);
 
-    return kernel::computeH<T>(bestH, tmpH, tmpA, tmpV, err,
+    return kernel::computeH<T>(bestH, tmpH, err,
                                x_src, y_src, x_dst, y_dst,
                                rnd, iter, nsamples, inlier_thr, htype);
 }
diff --git a/src/backend/cuda/kernel/homography.hpp b/src/backend/cuda/kernel/homography.hpp
index 8dd1794..65d880e 100644
--- a/src/backend/cuda/kernel/homography.hpp
+++ b/src/backend/cuda/kernel/homography.hpp
@@ -54,9 +54,10 @@ struct EPS<double>
 #define LMEDSConfidence 0.99f
 #define LMEDSOutlierRatio 0.4f
 
+extern __shared__ char sh[];
 
 template<typename T>
-__device__ void JacobiSVD(T* S, T* V, int m, int n)
+__device__ void JacobiSVD(int m, int n)
 {
     const int iterations = 30;
 
@@ -65,19 +66,13 @@ __device__ void JacobiSVD(T* S, T* V, int m, int n)
     int tid_y = threadIdx.y;
     int gid_y = blockIdx.y * blockDim.y + tid_y;
 
-    __shared__ T acc[512];
-    T* acc1 = acc;
-    T* acc2 = acc + 256;
+    __shared__ T acc1[256];
+    __shared__ T acc2[256];
 
-    __shared__ T s_S[16*81];
-    __shared__ T s_V[16*81];
     __shared__ T d[16*9];
 
-    for (int i = 0; i <= 4; i++)
-        s_S[tid_y * 81 + i*bsz_x + tid_x] = S[gid_y * 81 + i*bsz_x + tid_x];
-    if (tid_x == 0)
-        s_S[tid_y * 81 + 80] = S[gid_y * 81 + 80];
-    __syncthreads();
+    T* s_V = (T*)sh;
+    T* s_S = (T*)sh + 16*81;
 
     // Copy first 80 elements
     for (int i = 0; i <= 4; i++) {
@@ -194,12 +189,6 @@ __device__ void JacobiSVD(T* S, T* V, int m, int n)
         }
     }
     __syncthreads();
-
-    for (int i = 0; i <= 4; i++)
-        V[gid_y * 81 + tid_x+i*bsz_x] = s_V[tid_y * 81 + tid_x+i*bsz_x];
-    if (tid_x == 0)
-        V[gid_y * 81 + 80] = s_V[tid_y * 81 + 80];
-    __syncthreads();
 }
 
 __device__ bool computeMeanScale(
@@ -258,13 +247,11 @@ __device__ bool computeMeanScale(
     return !bad;
 }
 
-#define APTR(Z, Y, X) (A.ptr[(Z) * A.dims[0] * A.dims[1] + (Y) * A.dims[0] + (X)])
+#define SSPTR(Z, Y, X) (s_S[(Z) * 81 + (Y) * 9 + (X)])
 
 template<typename T>
 __global__ void buildLinearSystem(
     Param<T> H,
-    Param<T> A,
-    Param<T> V,
     CParam<float> x_src,
     CParam<float> y_src,
     CParam<float> x_dst,
@@ -272,7 +259,8 @@ __global__ void buildLinearSystem(
     CParam<float> rnd,
     const unsigned iterations)
 {
-    unsigned i = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned tid_y = threadIdx.y;
+    unsigned i = blockIdx.y * blockDim.y + tid_y;
 
     if (i < iterations) {
         float x_src_mean, y_src_mean;
@@ -288,6 +276,9 @@ __global__ void buildLinearSystem(
                          x_src, y_src, x_dst, y_dst,
                          rnd, i);
 
+        T* s_V = (T*)sh;
+        T* s_S = (T*)sh + 16*81;
+
         // Compute input matrix
         for (unsigned j = threadIdx.x; j < 4; j+=blockDim.x) {
             float srcx = (src_pt_x[j] - x_src_mean) * src_scale;
@@ -295,26 +286,45 @@ __global__ void buildLinearSystem(
             float dstx = (dst_pt_x[j] - x_dst_mean) * dst_scale;
             float dsty = (dst_pt_y[j] - y_dst_mean) * dst_scale;
 
-            APTR(i, 3, j*2) = -srcx;
-            APTR(i, 4, j*2) = -srcy;
-            APTR(i, 5, j*2) = -1.0f;
-            APTR(i, 6, j*2) = dsty*srcx;
-            APTR(i, 7, j*2) = dsty*srcy;
-            APTR(i, 8, j*2) = dsty;
-
-            APTR(i, 0, j*2+1) = srcx;
-            APTR(i, 1, j*2+1) = srcy;
-            APTR(i, 2, j*2+1) = 1.0f;
-            APTR(i, 6, j*2+1) = -dstx*srcx;
-            APTR(i, 7, j*2+1) = -dstx*srcy;
-            APTR(i, 8, j*2+1) = -dstx;
+            SSPTR(tid_y, 0, j*2) = 0.0f;
+            SSPTR(tid_y, 1, j*2) = 0.0f;
+            SSPTR(tid_y, 2, j*2) = 0.0f;
+            SSPTR(tid_y, 3, j*2) = -srcx;
+            SSPTR(tid_y, 4, j*2) = -srcy;
+            SSPTR(tid_y, 5, j*2) = -1.0f;
+            SSPTR(tid_y, 6, j*2) = dsty*srcx;
+            SSPTR(tid_y, 7, j*2) = dsty*srcy;
+            SSPTR(tid_y, 8, j*2) = dsty;
+
+            SSPTR(tid_y, 0, j*2+1) = srcx;
+            SSPTR(tid_y, 1, j*2+1) = srcy;
+            SSPTR(tid_y, 2, j*2+1) = 1.0f;
+            SSPTR(tid_y, 3, j*2+1) = 0.0f;
+            SSPTR(tid_y, 4, j*2+1) = 0.0f;
+            SSPTR(tid_y, 5, j*2+1) = 0.0f;
+            SSPTR(tid_y, 6, j*2+1) = -dstx*srcx;
+            SSPTR(tid_y, 7, j*2+1) = -dstx*srcy;
+            SSPTR(tid_y, 8, j*2+1) = -dstx;
+
+            if (j == 4) {
+                SSPTR(tid_y, 0, 8) = 0.0f;
+                SSPTR(tid_y, 1, 8) = 0.0f;
+                SSPTR(tid_y, 2, 8) = 0.0f;
+                SSPTR(tid_y, 3, 8) = 0.0f;
+                SSPTR(tid_y, 4, 8) = 0.0f;
+                SSPTR(tid_y, 5, 8) = 0.0f;
+                SSPTR(tid_y, 6, 8) = 0.0f;
+                SSPTR(tid_y, 7, 8) = 0.0f;
+                SSPTR(tid_y, 8, 8) = 0.0f;
+            }
         }
+        __syncthreads();
 
-        JacobiSVD<T>(A.ptr, V.ptr, 9, 9);
+        JacobiSVD<T>(9, 9);
 
         T vH[9], H_tmp[9];
         for (unsigned j = 0; j < 9; j++)
-            vH[j] = V.ptr[i * V.dims[0] * V.dims[1] + 8 * V.dims[0] + j];
+            vH[j] = s_V[tid_y * 81 + 8 * 9 + j];
 
         H_tmp[0] = src_scale*x_dst_mean*vH[6] + src_scale*vH[0]/dst_scale;
         H_tmp[1] = src_scale*x_dst_mean*vH[7] + src_scale*vH[1]/dst_scale;
@@ -337,7 +347,7 @@ __global__ void buildLinearSystem(
     }
 }
 
-#undef APTR
+#undef SSPTR
 
 // LMedS: http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node25.html
 template<typename T>
@@ -557,8 +567,6 @@ template<typename T>
 int computeH(
     Param<T> bestH,
     Param<T> H,
-    Param<T> A,
-    Param<T> V,
     Param<float> err,
     CParam<float> x_src,
     CParam<float> y_src,
@@ -574,8 +582,9 @@ int computeH(
     dim3 blocks(1, divup(iterations, threads.y));
 
     // Build linear system and solve SVD
-    CUDA_LAUNCH((buildLinearSystem<T>), blocks, threads,
-                H, A, V, x_src, y_src, x_dst, y_dst, rnd, iterations);
+    size_t ls_shared_sz = threads.x * 81 * 2 * sizeof(T);
+    CUDA_LAUNCH_SMEM((buildLinearSystem<T>), blocks, threads, ls_shared_sz,
+                H, x_src, y_src, x_dst, y_dst, rnd, iterations);
     POST_LAUNCH_CHECK();
 
     threads = dim3(256);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
