[hamradio-commits] [gnss-sdr] 01/126: Removed CheckCudaErrors function and the cuda helpers dependencies in CUDA accelerators

Sat Dec 26 18:37:55 UTC 2015

This is an automated email from the git hooks/post-receive script.

carles_fernandez-guest pushed a commit to branch next
in repository gnss-sdr.

commit 6aef3478cfbde22d22cc429007073c5cd17eca16
Author: Javier Arribas <javiarribas at gmail.com>
Date:   Thu Sep 10 15:15:01 2015 +0200

    Removed CheckCudaErrors function and the cuda helpers dependencies in
    CUDA accelerators
---
 .../tracking/gnuradio_blocks/CMakeLists.txt        |   2 +-
 .../gps_l1_ca_dll_pll_tracking_gpu_cc.cc           |  13 ++-
 src/algorithms/tracking/libs/CMakeLists.txt        |   2 +-
 .../tracking/libs/cuda_multicorrelator.cu          | 104 ++++++++++-----------
 .../tracking/libs/cuda_multicorrelator.h           |   1 -
 5 files changed, 58 insertions(+), 64 deletions(-)

diff --git a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
index a018fe1..6af7fb3 100644
--- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
+++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
@@ -48,7 +48,7 @@ include_directories(
      ${GNURADIO_RUNTIME_INCLUDE_DIRS}
      ${VOLK_GNSSSDR_INCLUDE_DIRS}
      ${CUDA_INCLUDE_DIRS}
-     ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
+     # ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
 )
 
 if(ENABLE_GENERIC_ARCH)
diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
index 1cf5d03..d16a0c6 100644
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@@ -50,8 +50,7 @@
 #include <volk/volk.h> //volk_alignement
 // includes
 #include <cuda_profiler_api.h>
-#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
-#include <helper_cuda.h>       // helper functions for CUDA error checking and initialization
+
 
 /*!
  * \todo Include in definition header file
@@ -131,21 +130,21 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
 
     // Get space for the resampled early / prompt / late local replicas
-	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));
+	cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped );
 
 
     //allocate host memory
     //pinned memory mode - use special function to get OS-pinned memory
-	checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));
+	cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped );
 
 	//old local codes vector
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+	// (cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
 
 	//new integrated shifts
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+	// (cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
 
 	// correlator outputs (scalar)
-	checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
+	cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined );
 	//map to EPL pointers
     d_Early = &d_corr_outs_gpu[0];
     d_Prompt =  &d_corr_outs_gpu[1];
diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt
index e6f66fa..c6c8c8c 100644
--- a/src/algorithms/tracking/libs/CMakeLists.txt
+++ b/src/algorithms/tracking/libs/CMakeLists.txt
@@ -28,7 +28,7 @@ if(ENABLE_CUDA)
 	
 	CUDA_INCLUDE_DIRECTORIES(
      ${CMAKE_CURRENT_SOURCE_DIR}
-     ${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
+     #${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
 	)
 
 	SET(LIB_TYPE STATIC) #set the lib type
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
index 166bca3..43ffeed 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -49,9 +49,6 @@
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_runtime.h>
 
-// helper functions and utilities to work with CUDA
-#include <helper_cuda.h>
-#include <helper_functions.h>
 
 #define ACCUM_N 256
 
@@ -224,7 +221,6 @@ __global__ void scalarProdGPUCPXxN(
         //int vectorBase = IMUL(elementN, vec);
         //int vectorEnd  = vectorBase + elementN;
 
-
         ////////////////////////////////////////////////////////////////////////
         // Each accumulator cycles through vectors with
         // stride equal to number of total number of accumulators ACCUM_N
@@ -392,28 +388,28 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
 //    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
 //    }
 
-	//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
 
 
     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
 
     size_t size = signal_length_samples * sizeof(GPU_Complex);
 
-	checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
-	//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
-	checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
+	cudaMalloc((void **)&d_sig_in, size);
+	// (cudaMalloc((void **)&d_nco_in, size));
+	cudaMalloc((void **)&d_sig_doppler_wiped, size);
 
 	// old version: all local codes are independent vectors
-	//checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
+	// (cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
 
 	// new version: only one vector with extra samples to shift the local code for the correlator set
 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift
     size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
-	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes));
-	checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators));
+	cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes);
+	cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators);
 
 	//scalars
-	checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
+	cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
 
     // Launch the Vector Add CUDA Kernel
 	threadsPerBlock = 256;
@@ -481,30 +477,30 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
 //    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
 //    }
 
-	//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
 
     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
 
     size_t size = signal_length_samples * sizeof(GPU_Complex);
 
-	checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
-	checkCudaErrors(cudaMemset(d_sig_in,0,size));
+	cudaMalloc((void **)&d_sig_in, size);
+	cudaMemset(d_sig_in,0,size);
 
-	//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
-	checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
-	checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size));
+	// (cudaMalloc((void **)&d_nco_in, size));
+	cudaMalloc((void **)&d_sig_doppler_wiped, size);
+	cudaMemset(d_sig_doppler_wiped,0,size);
 
-	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips));
-	checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips));
+	cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips);
+	cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips);
 
     d_code_length_chips=code_length_chips;
 
-	checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators));
-	checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators));
+	cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
+	cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);
 
 	//scalars
-	checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
-	checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators));
+	cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
+	cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
 
     // Launch the Vector Add CUDA Kernel
 	threadsPerBlock = 256;
@@ -523,12 +519,12 @@ bool cuda_multicorrelator::set_local_code_and_taps(
 		)
 {
     // local code CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1));
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
     d_code_length_chips=(float)code_length_chips;
 
     // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
-    checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
-                                    cudaMemcpyHostToDevice,stream1));
+    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
+                                    cudaMemcpyHostToDevice,stream1);
 
 	return true;
 }
@@ -550,40 +546,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
 
 	// input signal CPU -> GPU copy memory
 
-    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-                                    cudaMemcpyHostToDevice, stream1));
+    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+                                    cudaMemcpyHostToDevice, stream1);
 
     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
-    //checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
+    // (cudaMemcpyAsync(d_nco_in, nco_in, memSize,
     //                                cudaMemcpyHostToDevice, stream1));
 
 
 	// old version: all local codes are independent vectors
-    //checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
+    // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
     //                                cudaMemcpyHostToDevice, stream2));
 
 	// new version: only one vector with extra samples to shift the local code for the correlator set
 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift
 
     // local code CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
-                                    cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
+                                    cudaMemcpyHostToDevice, stream2);
     // Correlator shifts vector CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
-                                    cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
+                                    cudaMemcpyHostToDevice, stream2);
 
 
     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaStreamSynchronize(stream1);
     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
 
 
     //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 
     //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream2));
-    //checkCudaErrors(cudaDeviceSynchronize());
+    cudaStreamSynchronize(stream1);
+    cudaStreamSynchronize(stream2);
+    // (cudaDeviceSynchronize());
 
     //old
 //    scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
@@ -604,15 +600,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
 			n_correlators,
 			signal_length_samples
 		);
-    checkCudaErrors(cudaGetLastError());
+    cudaGetLastError();
     //wait for correlators end...
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream2);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.
 
     //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-            cudaMemcpyDeviceToHost));
+    cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost);
     return true;
 }
 
@@ -629,19 +625,19 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
 
 	size_t memSize = signal_length_samples * sizeof(std::complex<float>);
 	// input signal CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-                                    cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+                                    cudaMemcpyHostToDevice, stream2);
 
     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
 
     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream2);
 
     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
 
     //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream1);
+    cudaStreamSynchronize(stream2);
 
     //launch the multitap correlator with integrated local code resampler!
 
@@ -657,16 +653,16 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
 			signal_length_samples
 		);
 
-    checkCudaErrors(cudaGetLastError());
+    cudaGetLastError();
     //wait for correlators end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaStreamSynchronize(stream1);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.
 
     //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-            cudaMemcpyDeviceToHost,stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost,stream1);
+    cudaStreamSynchronize(stream1);
     return true;
 }
 
@@ -708,7 +704,7 @@ bool cuda_multicorrelator::free_cuda()
     // needed to ensure correct operation when the application is being
     // profiled. Calling cudaDeviceReset causes all profile data to be
     // flushed before the application exits
-	//checkCudaErrors(cudaDeviceReset());
+	// (cudaDeviceReset());
 	return true;
 }
 
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
index 97594e5..72fa8db 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -167,5 +167,4 @@ private:
 
 };
 
-
 #endif /* CUDA_MULTICORRELATOR_H_ */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git