[arrayfire] 107/408: PERF: Improvements to CUDA JIT for non linear 3D and 4D arrays

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:30 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit ecdab67780aa5b4b5f364d0da06768865385099d
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Mon Jul 6 13:17:40 2015 -0400

    PERF: Improvements to CUDA JIT for non linear 3D and 4D arrays
---
 src/backend/cuda/jit.cpp | 66 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp
index 3fb6d3e..c432e66 100644
--- a/src/backend/cuda/jit.cpp
+++ b/src/backend/cuda/jit.cpp
@@ -89,7 +89,7 @@ static string getKernelString(string funcName, Node *node, bool is_linear)
     kerStream << node->getTypeStr() <<"* %out,\n"
               << "i32 %ostr0, i32 %ostr1, i32 %ostr2, i32 %ostr3,\n"
               << "i32 %odim0, i32 %odim1, i32 %odim2, i32 %odim3,\n"
-              << "i32 %blkx, i32 %blky) {"
+              << "i32 %blkx, i32 %blky, i32 %ndims) {"
               << "\n\n";
 
     kerStream << "entry:\n\n";
@@ -105,15 +105,53 @@ static string getKernelString(string funcName, Node *node, bool is_linear)
         kerStream << "%tidy = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n";
         kerStream << "%bdmy = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n";
 
-        kerStream << "%id2 = sdiv i32 %bidx, %blkx\n";
-        kerStream << "%id3 = sdiv i32 %bidy, %blky\n";
-        kerStream << "%id2m = mul i32 %id2, %blkx\n";
-        kerStream << "%id3m = mul i32 %id3, %blky\n";
-        kerStream << "%blk_x = sub i32 %bidx, %id2m\n";
-        kerStream << "%blk_y = sub i32 %bidy, %id3m\n";
-        kerStream << "%id0m = mul i32 %blk_x, %bdmx\n";
-        kerStream << "%id1m = mul i32 %blk_y, %bdmy\n";
+        kerStream << "%blk_x = alloca i32, align 4\n";
+        kerStream << "%blk_y = alloca i32, align 4\n";
+        kerStream << "%id_3 = alloca i32, align 4\n";
+        kerStream << "%id_2 = alloca i32, align 4\n";
+        kerStream << "store i32 %bidx, i32* %blk_x, align 4\n";
+        kerStream << "store i32 %bidy, i32* %blk_y, align 4\n";
+        kerStream << "store i32 0, i32* %id_2, align 4\n";
+        kerStream << "store i32 0, i32* %id_3, align 4\n";
+
+        kerStream << "%two = alloca i32, align 4\n";
+        kerStream << "store i32 2, i32* %two, align 4\n";
+        kerStream << "%twoval = load i32* %two, align 4\n";
+        kerStream << "%is34 = icmp sgt i32 %ndims, %twoval\n";
+        kerStream << "br i1 %is34, label %do34, label %do2\n";
+
+        kerStream << "\ndo34:\n";
+
+        kerStream << "%id2t = sdiv i32 %bidx, %blkx\n";
+        kerStream << "store i32 %id2t, i32* %id_2, align 4\n";
+        kerStream << "%id2m = mul i32 %id2t, %blkx\n";
+        kerStream << "%blk_xx = sub i32 %bidx, %id2m\n";
+        kerStream << "store i32 %blk_xx, i32* %blk_x, align 4\n";
+
+        kerStream << "%three = alloca i32, align 4\n";
+        kerStream << "store i32 3, i32* %three, align 4\n";
+        kerStream << "%threeval = load i32* %three, align 4\n";
+        kerStream << "%is4 = icmp sgt i32 %ndims, %threeval\n";
+        kerStream << "br i1 %is4, label %do4, label %do2\n";
+
+        kerStream << "\ndo4:\n";
+        kerStream << "%id3t = sdiv i32 %bidy, %blky\n";
+        kerStream << "store i32 %id3t, i32* %id_3, align 4\n";
+        kerStream << "%id3m = mul i32 %id3t, %blky\n";
+        kerStream << "%blk_yy = sub i32 %bidy, %id3m\n";
+        kerStream << "store i32 %blk_yy, i32* %blk_y, align 4\n";
+        kerStream << "br label %do2\n";
+
+        kerStream << "\ndo2:\n";
+        kerStream << "%id2 = load i32* %id_2, align 4\n";
+        kerStream << "%id3 = load i32* %id_3, align 4\n";
+
+        kerStream << "%tmp_x = load i32* %blk_x, align 4\n";
+        kerStream << "%id0m = mul i32 %tmp_x, %bdmx\n";
         kerStream << "%id0 = add i32 %tidx, %id0m\n";
+
+        kerStream << "%tmp_y = load i32* %blk_y, align 4\n";
+        kerStream << "%id1m = mul i32 %tmp_y, %bdmy\n";
         kerStream << "%id1 = add i32 %tidy, %id1m\n";
         kerStream << "\n\n";
 
@@ -194,7 +232,7 @@ static string getKernelString(string funcName, Node *node, bool is_linear)
               << node->getTypeStr() << "*,\n"
               << "i32, i32, i32, i32,\n"
               << "i32, i32, i32, i32,\n"
-              << "i32, i32\n"
+              << "i32, i32, i32\n"
               << ")* " << funcName << ",\n "
               << "metadata !\"kernel\", i32 1}\n";
 
@@ -400,6 +438,13 @@ void evalNodes(Param<T> &out, Node *node)
     int blocks_x_ = 1, blocks_y_ = 1;
     int blocks_x  = 1, blocks_y = 1;
 
+    int num_odims = 4;
+
+    while (num_odims >= 1) {
+        if (out.dims[num_odims - 1] == 1) num_odims--;
+        else break;
+    }
+
     if (is_linear) {
 
         threads_x = 256;
@@ -430,6 +475,7 @@ void evalNodes(Param<T> &out, Node *node)
 
     args.push_back((void *)&blocks_x_);
     args.push_back((void *)&blocks_y_);
+    args.push_back((void *)&num_odims);
 
     CU_CHECK(cuLaunchKernel(ker,
                             blocks_x,

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git



More information about the debian-science-commits mailing list