[arrayfire] 211/284: Handle CUDA devices locked in exclusive mode
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Feb 7 18:59:34 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.
commit 968ae4e80ce8e6263fdc3f4381ae8b895df44bc4
Author: Shehzan Mohammed <shehzan at arrayfire.com>
Date: Mon Jan 11 17:11:44 2016 -0500
Handle CUDA devices locked in exclusive mode
* When the default device 0 is exclusively locked, ArrayFire will try to pick
a different device
* When the user uses setDevice to set a device that is locked, then ArrayFire
will error out
* Handle such a case when freeing memory in memory manager destructor
Signed-off-by: Shehzan Mohammed <shehzan at arrayfire.com>
---
src/backend/cuda/err_cuda.hpp | 37 +++++++++++++--------------
src/backend/cuda/memory.cpp | 12 +++++++--
src/backend/cuda/platform.cpp | 58 +++++++++++++++++++++++++++++++++++--------
3 files changed, 77 insertions(+), 30 deletions(-)
diff --git a/src/backend/cuda/err_cuda.hpp b/src/backend/cuda/err_cuda.hpp
index a975fb5..dd87bdf 100644
--- a/src/backend/cuda/err_cuda.hpp
+++ b/src/backend/cuda/err_cuda.hpp
@@ -17,22 +17,23 @@
__AF_FILENAME__, __LINE__, "CUDA"); \
} while(0)
-#define CUDA_CHECK(fn) do { \
- cudaError_t _cuda_error = fn; \
- if (_cuda_error != cudaSuccess) { \
- char cuda_err_msg[1024]; \
- snprintf(cuda_err_msg, \
- sizeof(cuda_err_msg), \
- "CUDA Error (%d): %s\n", \
- (int)(_cuda_error), \
- cudaGetErrorString( \
- cudaGetLastError())); \
- \
- if (_cuda_error == cudaErrorMemoryAllocation) { \
- AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM); \
- } else { \
- AF_ERROR(cuda_err_msg, \
- AF_ERR_INTERNAL); \
- } \
- } \
+#define CUDA_CHECK(fn) do { \
+ cudaError_t _cuda_error = fn; \
+ if (_cuda_error != cudaSuccess) { \
+ char cuda_err_msg[1024]; \
+ snprintf(cuda_err_msg, \
+ sizeof(cuda_err_msg), \
+ "CUDA Error (%d): %s\n", \
+ (int)(_cuda_error), \
+ cudaGetErrorString( \
+ cudaGetLastError())); \
+ \
+ if (_cuda_error == cudaErrorMemoryAllocation) { \
+ AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM); \
+ } else if (_cuda_error == cudaErrorDevicesUnavailable) {\
+ AF_ERROR(cuda_err_msg, AF_ERR_DRIVER); \
+ } else { \
+ AF_ERROR(cuda_err_msg, AF_ERR_INTERNAL); \
+ } \
+ } \
} while(0)
diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp
index 0e3fb5a..20e2547 100644
--- a/src/backend/cuda/memory.cpp
+++ b/src/backend/cuda/memory.cpp
@@ -45,8 +45,16 @@ public:
{
common::lock_guard_t lock(this->memory_mutex);
for (int n = 0; n < getDeviceCount(); n++) {
- cuda::setDevice(n);
- this->garbageCollect();
+ try {
+ cuda::setDevice(n);
+ this->garbageCollect();
+ } catch(AfError err) {
+ if(err.getError() == AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable
+ continue;
+ } else {
+ throw err;
+ }
+ }
}
}
};
diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp
index 72fc0bc..6919a04 100644
--- a/src/backend/cuda/platform.cpp
+++ b/src/backend/cuda/platform.cpp
@@ -386,20 +386,58 @@ void DeviceManager::sortDevices(sort_mode mode)
int DeviceManager::setActiveDevice(int device, int nId)
{
- if(device > (int)cuDevices.size()) {
- return -1;
- } else {
- int old = activeDev;
- if(nId == -1) nId = getDeviceNativeId(device);
- CUDA_CHECK(cudaSetDevice(nId));
- activeDev = device;
+ static bool first = true;
- if(!streams[device]) {
- CUDA_CHECK(cudaStreamCreate(&streams[device]));
- }
+ int numDevices = cuDevices.size();
+
+ if(device > numDevices) return -1;
+ int old = activeDev;
+ if(nId == -1) nId = getDeviceNativeId(device);
+ CUDA_CHECK(cudaSetDevice(nId));
+ cudaError_t err = cudaStreamCreate(&streams[device]);
+ activeDev = device;
+
+ if (err == cudaSuccess) return old;
+
+ // Comes when user sets device
+ // If success, return. Else throw error
+ if (!first) {
+ CUDA_CHECK(err);
return old;
}
+
+ // Comes only when first is true. Set it to false
+ first = false;
+
+ while(device < numDevices) {
+ // Check for errors other than DevicesUnavailable
+ // If success, return. Else throw error
+ // If DevicesUnavailable, try other devices (while loop below)
+ if (err != cudaErrorDevicesUnavailable) {
+ CUDA_CHECK(err);
+ activeDev = device;
+ return old;
+ }
+ cudaGetLastError(); // Reset error stack
+ printf("Warning: Device %d is unavailable. Incrementing to next device \n", device);
+
+ // Comes here is the device is in exclusive mode or
+ // otherwise fails streamCreate with this error.
+ // All other errors will error out
+ device++;
+
+ // Can't call getNativeId here as it will cause an infinite loop with the constructor
+ nId = cuDevices[device].nativeId;
+
+ CUDA_CHECK(cudaSetDevice(nId));
+ err = cudaStreamCreate(&streams[device]);
+ }
+
+ // If all devices fail with DevicesUnavailable, then throw this error
+ CUDA_CHECK(err);
+
+ return old;
}
void sync(int device)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list