[arrayfire] 211/284: Handle CUDA devices locked in exclusive mode

Sun Feb 7 18:59:34 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.

commit 968ae4e80ce8e6263fdc3f4381ae8b895df44bc4
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date:   Mon Jan 11 17:11:44 2016 -0500

    Handle CUDA devices locked in exclusive mode
    
    * When the default device 0 is exclusively locked, ArrayFire will try to pick
      a different device
    * When the user uses setDevice to set a device that is locked, then ArrayFire
      will error out
    * Handle such a case when freeing memory in memory manager destructor
    
    Signed-off-by: Shehzan Mohammed <shehzan at arrayfire.com>
---
 src/backend/cuda/err_cuda.hpp | 37 +++++++++++++--------------
 src/backend/cuda/memory.cpp   | 12 +++++++--
 src/backend/cuda/platform.cpp | 58 +++++++++++++++++++++++++++++++++++--------
 3 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/src/backend/cuda/err_cuda.hpp b/src/backend/cuda/err_cuda.hpp
index a975fb5..dd87bdf 100644
--- a/src/backend/cuda/err_cuda.hpp
+++ b/src/backend/cuda/err_cuda.hpp
@@ -17,22 +17,23 @@
                 __AF_FILENAME__, __LINE__, "CUDA");     \
     } while(0)
 
-#define CUDA_CHECK(fn) do {                                 \
-        cudaError_t _cuda_error = fn;                       \
-        if (_cuda_error != cudaSuccess) {                   \
-            char cuda_err_msg[1024];                        \
-            snprintf(cuda_err_msg,                          \
-                     sizeof(cuda_err_msg),                  \
-                     "CUDA Error (%d): %s\n",               \
-                     (int)(_cuda_error),                    \
-                     cudaGetErrorString(                    \
-                         cudaGetLastError()));              \
-                                                            \
-            if (_cuda_error == cudaErrorMemoryAllocation) { \
-                AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM);      \
-            } else {                                        \
-                AF_ERROR(cuda_err_msg,                      \
-                         AF_ERR_INTERNAL);                  \
-            }                                               \
-        }                                                   \
+#define CUDA_CHECK(fn) do {                                         \
+        cudaError_t _cuda_error = fn;                               \
+        if (_cuda_error != cudaSuccess) {                           \
+            char cuda_err_msg[1024];                                \
+            snprintf(cuda_err_msg,                                  \
+                     sizeof(cuda_err_msg),                          \
+                     "CUDA Error (%d): %s\n",                       \
+                     (int)(_cuda_error),                            \
+                     cudaGetErrorString(                            \
+                         cudaGetLastError()));                      \
+                                                                    \
+            if (_cuda_error == cudaErrorMemoryAllocation) {         \
+                AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM);              \
+            } else if (_cuda_error == cudaErrorDevicesUnavailable) {\
+                AF_ERROR(cuda_err_msg, AF_ERR_DRIVER);              \
+            } else {                                                \
+                AF_ERROR(cuda_err_msg, AF_ERR_INTERNAL);            \
+            }                                                       \
+        }                                                           \
     } while(0)
diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp
index 0e3fb5a..20e2547 100644
--- a/src/backend/cuda/memory.cpp
+++ b/src/backend/cuda/memory.cpp
@@ -45,8 +45,16 @@ public:
     {
         common::lock_guard_t lock(this->memory_mutex);
         for (int n = 0; n < getDeviceCount(); n++) {
-            cuda::setDevice(n);
-            this->garbageCollect();
+            try {
+                cuda::setDevice(n);
+                this->garbageCollect();
+            } catch(AfError err) {
+                if(err.getError() == AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable
+                    continue;
+                } else {
+                    throw err;
+                }
+            }
         }
     }
 };
diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp
index 72fc0bc..6919a04 100644
--- a/src/backend/cuda/platform.cpp
+++ b/src/backend/cuda/platform.cpp
@@ -386,20 +386,58 @@ void DeviceManager::sortDevices(sort_mode mode)
 
 int DeviceManager::setActiveDevice(int device, int nId)
 {
-    if(device > (int)cuDevices.size()) {
-        return -1;
-    } else {
-        int old = activeDev;
-        if(nId == -1) nId = getDeviceNativeId(device);
-        CUDA_CHECK(cudaSetDevice(nId));
-        activeDev = device;
+    static bool first = true;
 
-        if(!streams[device]) {
-            CUDA_CHECK(cudaStreamCreate(&streams[device]));
-        }
+    int numDevices = cuDevices.size();
+
+    if(device > numDevices) return -1;
 
+    int old = activeDev;
+    if(nId == -1) nId = getDeviceNativeId(device);
+    CUDA_CHECK(cudaSetDevice(nId));
+    cudaError_t err = cudaStreamCreate(&streams[device]);
+    activeDev = device;
+
+    if (err == cudaSuccess) return old;
+
+    // Comes when user sets device
+    // If success, return. Else throw error
+    if (!first) {
+        CUDA_CHECK(err);
         return old;
     }
+
+    // Comes only when first is true. Set it to false
+    first = false;
+
+    while(device < numDevices) {
+        // Check for errors other than DevicesUnavailable
+        // If success, return. Else throw error
+        // If DevicesUnavailable, try other devices (while loop below)
+        if (err != cudaErrorDevicesUnavailable) {
+            CUDA_CHECK(err);
+            activeDev = device;
+            return old;
+        }
+        cudaGetLastError(); // Reset error stack
+        printf("Warning: Device %d is unavailable. Incrementing to next device \n", device);
+
+        // Comes here is the device is in exclusive mode or
+        // otherwise fails streamCreate with this error.
+        // All other errors will error out
+        device++;
+
+        // Can't call getNativeId here as it will cause an infinite loop with the constructor
+        nId = cuDevices[device].nativeId;
+
+        CUDA_CHECK(cudaSetDevice(nId));
+        err = cudaStreamCreate(&streams[device]);
+    }
+
+    // If all devices fail with DevicesUnavailable, then throw this error
+    CUDA_CHECK(err);
+
+    return old;
 }
 
 void sync(int device)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git