[hamradio-commits] [gnss-sdr] 234/251: Multi-GPU device suport (splits the tracking channels between all the availables GPUs (CUDA only!))
Carles Fernandez
carles_fernandez-guest at moszumanska.debian.org
Wed Sep 2 00:22:56 UTC 2015
This is an automated email from the git hooks/post-receive script.
carles_fernandez-guest pushed a commit to branch master
in repository gnss-sdr.
commit fb2b12403a551669a70456918b9ddc066365e765
Author: Javier <jarribas at cttc.es>
Date: Fri Jul 24 18:07:33 2015 +0200
Multi-GPU device suport (splits the tracking channels between all the
availables GPUs (CUDA only!))
---
.../tracking/libs/cuda_multicorrelator.cu | 67 ++++++++++++++++------
.../tracking/libs/cuda_multicorrelator.h | 2 +
2 files changed, 52 insertions(+), 17 deletions(-)
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
index 3f027cb..5f97ee2 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -264,23 +264,54 @@ CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int
bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators)
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
- findCudaDevice(argc, (const char **)argv);
-
- cudaDeviceProp prop;
- int whichDevice;
- cudaGetDevice( &whichDevice );
- cudaGetDeviceProperties( &prop, whichDevice );
- //debug code
- if (prop.canMapHostMemory != 1) {
- printf( "Device can not map memory.\n" );
+// findCudaDevice(argc, (const char **)argv);
+ cudaDeviceProp prop;
+ int num_devices, device;
+ cudaGetDeviceCount(&num_devices);
+ num_gpu_devices=num_devices;
+ if (num_devices > 1) {
+ int max_multiprocessors = 0, max_device = 0;
+ for (device = 0; device < num_devices; device++) {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, device);
+ if (max_multiprocessors < properties.multiProcessorCount) {
+ max_multiprocessors = properties.multiProcessorCount;
+ max_device = device;
+ }
+ printf("Found GPU device # %i\n",device);
+ }
+ //set random device!
+ selected_device=(rand() % num_devices);
+ printf("selected_device=%i\n",selected_device);
+ cudaGetDeviceProperties( &prop, selected_device );
+ //debug code
+ if (prop.canMapHostMemory != 1) {
+ printf( "Device can not map memory.\n" );
+ }
+ printf("L2 Cache size= %u \n",prop.l2CacheSize);
+ printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+ printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+ printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+ printf("deviceOverlap= %i \n",prop.deviceOverlap);
+ printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+ }else{
+ selected_device=0;
+ int whichDevice;
+ cudaGetDevice( &whichDevice );
+ cudaGetDeviceProperties( &prop, whichDevice );
+ //debug code
+ if (prop.canMapHostMemory != 1) {
+ printf( "Device can not map memory.\n" );
+ }
+
+ printf("L2 Cache size= %u \n",prop.l2CacheSize);
+ printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+ printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+ printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+ printf("deviceOverlap= %i \n",prop.deviceOverlap);
+ printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
}
- printf("L2 Cache size= %u \n",prop.l2CacheSize);
- printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
- printf("maxGridSize= %i \n",prop.maxGridSize[0]);
- printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
- printf("deviceOverlap= %i \n",prop.deviceOverlap);
- //end debug code
//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
@@ -288,7 +319,8 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
// ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
size_t size = signal_length_samples * sizeof(GPU_Complex);
-
+ cudaSetDevice(selected_device); //generates a random number between 0 and num_devices to split the threads between GPUs
+
checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
@@ -327,7 +359,7 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
{
size_t memSize = signal_length_samples * sizeof(std::complex<float>);
-
+ cudaSetDevice(selected_device); //generates a random number between 0 and num_devices to split the threads between GPUs
// input signal CPU -> GPU copy memory
checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
@@ -398,6 +430,7 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
bool cuda_multicorrelator::free_cuda()
{
+ cudaSetDevice(selected_device); //generates a random number between 0 and num_devices to split the threads between GPUs
// Free device global memory
cudaFree(d_sig_in);
//cudaFree(d_nco_in);
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
index e29cba5..1a0f613 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -138,6 +138,8 @@ private:
cudaStream_t stream1;
cudaStream_t stream2;
+ int num_gpu_devices;
+ int selected_device;
};
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git
More information about the pkg-hamradio-commits
mailing list