[hamradio-commits] [gnss-sdr] 245/251: Some CUDA cleaning and documentation

Wed Sep 2 00:22:57 UTC 2015

This is an automated email from the git hooks/post-receive script.

carles_fernandez-guest pushed a commit to branch master
in repository gnss-sdr.

commit ef136e5c7489cdbc64a2fe8ee9c91a705fbb9697
Author: Carles Fernandez <carles.fernandez at gmail.com>
Date:   Tue Aug 25 19:01:02 2015 +0200

    Some CUDA cleaning and documentation
---
 CMakeLists.txt                                     |  33 ++--
 README.md                                          |  14 +-
 .../adapters/gps_l1_ca_dll_pll_tracking_gpu.h      |   3 +-
 .../gps_l1_ca_dll_pll_tracking_gpu_cc.cc           |  36 ++--
 .../gps_l1_ca_dll_pll_tracking_gpu_cc.h            |   2 +-
 .../tracking/libs/cuda_multicorrelator.h           | 204 ++++++++++-----------
 6 files changed, 154 insertions(+), 138 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e8affc..4e60216 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(ENABLE_GPROF "Enable the use of the GNU profiler tool 'gprof'" OFF)
 
 # Acceleration
 option(ENABLE_OPENCL "Enable building of processing blocks implemented with OpenCL (experimental)" OFF)
+option(ENABLE_CUDA "Enable building of processing blocks implemented with CUDA (experimental, requires CUDA SDK)" OFF)
 
 # Building and packaging options
 option(ENABLE_GENERIC_ARCH "Builds a portable binary" OFF)
@@ -883,6 +884,24 @@ else(ENABLE_OPENCL)
 endif(ENABLE_OPENCL)
 
 
+###############################################################################
+# CUDA (OPTIONAL)
+###############################################################################
+# A CUDA_GPU_ACCEL environment variable that evaluates as true switches ENABLE_CUDA on.
+if($ENV{CUDA_GPU_ACCEL})
+    message(STATUS "CUDA_GPU_ACCEL environment variable found." )
+    set(ENABLE_CUDA ON)
+endif($ENV{CUDA_GPU_ACCEL})
+
+if(ENABLE_CUDA)
+    # REQUIRED stops the configuration with an error when the CUDA toolkit is not found.
+    FIND_PACKAGE(CUDA REQUIRED)
+    message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
+    message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
+else(ENABLE_CUDA)
+    message(STATUS "NVIDIA CUDA GPU Acceleration will not be enabled." )
+    message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for GPU-based acceleration using CUDA." )
+endif(ENABLE_CUDA)
 
 
 ################################################################################
@@ -955,20 +974,6 @@ else(ENABLE_OSMOSDR)
     message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
 endif(ENABLE_OSMOSDR)
 
-if($ENV{CUDA_GPU_ACCEL})
-    message(STATUS "CUDA_GPU_ACCEL environment variable found." )
-    set(ENABLE_CUDA ON)
-endif($ENV{CUDA_GPU_ACCEL})
-
-if(ENABLE_CUDA)
-    FIND_PACKAGE(CUDA REQUIRED)
-    message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
-    message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
-else(ENABLE_CUDA)
-    message(STATUS "NVIDIA CUDA GPU Acceleration will is not enabled." )
-    message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for GPU-based acceleration using CUDA." )
-endif(ENABLE_CUDA)
-
 
 if($ENV{FLEXIBAND_DRIVER})
     message(STATUS "FLEXIBAND_DRIVER environment variable found." )
diff --git a/README.md b/README.md
index 4ff47bf..757b432 100644
--- a/README.md
+++ b/README.md
@@ -345,6 +345,19 @@ $ sudo make install
 ~~~~~~ 
 
 
+###### Build CUDA support (OPTIONAL):
+
+In order to enable the building of blocks that use CUDA, NVIDIA's parallel programming model that enables graphics processing unit (GPU) acceleration for data-parallel computations, you first need to install the CUDA Toolkit from the [NVIDIA Developers Download page](https://developer.nvidia.com/cuda-downloads "CUDA Downloads"). Then, build GNSS-SDR by doing:
+
+~~~~~~ 
+$ cmake -DENABLE_CUDA=ON ../
+$ make
+$ sudo make install
+~~~~~~ 
+
+Of course, you will also need a GPU that [supports CUDA](https://developer.nvidia.com/cuda-gpus "CUDA GPUs").
+
+
 ###### Build a portable binary
 
 In order to build an executable that not depends on the specific SIMD instruction set that is present in the processor of the compiling machine, so other users can execute it in other machines without those particular sets, use:
@@ -841,7 +854,6 @@ Tracking_1C.dump=false ; Enable internal binary data file logging [true] or [fal
 Tracking_1C.dump_filename=./tracking_ch_ ; Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number.
 Tracking_1C.pll_bw_hz=50.0 ; PLL loop filter bandwidth [Hz]
 Tracking_1C.dll_bw_hz=2.0 ; DLL loop filter bandwidth [Hz]
-Tracking_1C.fll_bw_hz=10.0 ; FLL loop filter bandwidth [Hz]
 Tracking_1C.order=3 ; PLL/DLL loop filter order [2] or [3]
 Tracking_1C.early_late_space_chips=0.5 ; correlator early-late space [chips]. 
 ~~~~~~ 
diff --git a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
index 22bbe5d..36034e1 100644
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
@@ -51,7 +51,6 @@ class ConfigurationInterface;
 class GpsL1CaDllPllTrackingGPU : public TrackingInterface
 {
 public:
-
   GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
             std::string role,
             unsigned int in_streams,
@@ -65,7 +64,7 @@ public:
         return role_;
     }
 
-    //! Returns "GPS_L1_CA_DLL_PLL_Tracking"
+    //! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
     std::string implementation()
     {
         return "GPS_L1_CA_DLL_PLL_Tracking_GPU";
diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
index 1cf5d03..339c372 100644
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));
 
     multicorrelator_gpu = new cuda_multicorrelator();
-    int N_CORRELATORS=3;
+    int N_CORRELATORS = 3;
     //local code resampler on CPU (old)
     //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
 
@@ -131,22 +131,22 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
 
     // Get space for the resampled early / prompt / late local replicas
-	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));
+    checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));
 
 
     //allocate host memory
     //pinned memory mode - use special function to get OS-pinned memory
-	checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));
+    checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));
 
-	//old local codes vector
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    //old local codes vector
+    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
 
-	//new integrated shifts
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+    //new integrated shifts
+    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
 
-	// correlator outputs (scalar)
-	checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
-	//map to EPL pointers
+    // correlator outputs (scalar)
+    checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
+    //map to EPL pointers
     d_Early = &d_corr_outs_gpu[0];
     d_Prompt =  &d_corr_outs_gpu[1];
     d_Late = &d_corr_outs_gpu[2];
@@ -284,13 +284,13 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
 {
     d_dump_file.close();
 
-	cudaFreeHost(in_gpu);
-	cudaFreeHost(d_carr_sign_gpu);
-	cudaFreeHost(d_corr_outs_gpu);
-	cudaFreeHost(d_local_code_shift_chips);
+    cudaFreeHost(in_gpu);
+    cudaFreeHost(d_carr_sign_gpu);
+    cudaFreeHost(d_corr_outs_gpu);
+    cudaFreeHost(d_local_code_shift_chips);
 
-	multicorrelator_gpu->free_cuda();
-	delete(multicorrelator_gpu);
+    multicorrelator_gpu->free_cuda();
+    delete(multicorrelator_gpu);
 
     volk_free(d_ca_code);
 
@@ -339,7 +339,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
             // UPDATE NCO COMMAND
             float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
 
-        	//code resampler on GPU (new)
+            //code resampler on GPU (new)
             float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
             float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
 
@@ -353,7 +353,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
     				rem_code_phase_chips,
     				d_current_prn_length_samples,
     				3);
-        	cudaProfilerStop();
+            cudaProfilerStop();
 
             // ################## PLL ##########################################################
             // PLL discriminator
diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
index 644751e..a3108f8 100644
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
@@ -130,7 +130,7 @@ private:
     gr_complex* in_gpu;
     gr_complex* d_carr_sign_gpu;
     gr_complex* d_local_codes_gpu;
-	float* d_local_code_shift_chips;
+    float* d_local_code_shift_chips;
     gr_complex* d_corr_outs_gpu;
     cuda_multicorrelator *multicorrelator_gpu;
 
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
index 97594e5..df640f5 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -45,67 +45,67 @@
 #endif
 
 #include <complex>
-
 #include <cuda.h>
-// CUDA runtime
 #include <cuda_runtime.h>
 
 // GPU new internal data types for complex numbers
 
-struct GPU_Complex {
-	 float r;
-	 float i;
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
-		#ifdef __CUDACC__
-		 return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
-		#else
-		 return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
-		#endif
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
-		 return GPU_Complex(r+a.r, i+a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
-		 r+=a.r;
-		 i+=a.i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
-	 {
-		 //c=a*b+c
-		 //real part
-		 //c.r=(a.r*b.r - a.i*b.i)+c.r
-		#ifdef __CUDACC__
-			 r=__fmaf_rn(a.r,b.r,r);
-			 r=__fmaf_rn(-a.i,b.i,r);
-			 //imag part
-			 i=__fmaf_rn(a.i,b.r,i);
-			 i=__fmaf_rn(a.r,b.i,i);
-		#else
-			 r=(a.r*b.r - a.i*b.i)+r;
-			 i=(a.i*b.r - a.r*b.i)+i;
-		#endif
+//! Single-precision complex value: r is the real part, i the imaginary part
+struct GPU_Complex
+{
+    float r;
+    float i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
+    //! Squared magnitude: r^2 + i^2
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { return r * r + i * i; }
+    //! Complex product (*this) * a; the __CUDACC__ branch spells each product with __fmul_rn
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a)
+    {
+#ifdef __CUDACC__
+        return GPU_Complex(__fmul_rn(r, a.r) - __fmul_rn(i, a.i), __fmul_rn(i, a.r) + __fmul_rn(r, a.i));
+#else
+        return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
+#endif
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) { return GPU_Complex(r + a.r, i + a.i); }
+    CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) { r += a.r; i += a.i; }
+    //! Complex multiply-accumulate: (*this) += a * b
+    CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
+    {
+        // real part: r += a.r*b.r - a.i*b.i
+        // imag part: i += a.i*b.r + a.r*b.i
+#ifdef __CUDACC__
+        r = __fmaf_rn(a.r, b.r, r);
+        r = __fmaf_rn(-a.i, b.i, r);
+        i = __fmaf_rn(a.i, b.r, i);
+        i = __fmaf_rn(a.r, b.i, i);
+#else
+        // both cross terms are added here, exactly as in the __fmaf_rn branch
+        r = (a.r*b.r - a.i*b.i) + r;
+        i = (a.i*b.r + a.r*b.i) + i;
+#endif
 
-	 }
+    }
 };
 
-struct GPU_Complex_Short {
-	 float r;
-	 float i;
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
-		 return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
-		 return GPU_Complex_Short(r+a.r, i+a.i);
-	 }
+//! Complex value built from short int components; r and i are stored as float
+struct GPU_Complex_Short
+{
+    float r;
+    float i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
+    //! Squared magnitude: r^2 + i^2
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { return r * r + i * i; }
+    // NOTE(review): both operators feed float expressions to the (short int, short int)
+    // constructor, so every result component is converted to short int first -- confirm intended.
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a)
+    {
+        const float re = r*a.r - i*a.i;
+        const float im = i*a.r + r*a.i;
+        return GPU_Complex_Short(re, im);
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) { return GPU_Complex_Short(r+a.r, i+a.i); }
 };
 /*!
  * \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
@@ -113,58 +113,58 @@ struct GPU_Complex_Short {
 class cuda_multicorrelator
 {
 public:
-	cuda_multicorrelator();
-	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
-	bool init_cuda_integrated_resampler(
-			const int argc, const char **argv,
-			int signal_length_samples,
-			int code_length_chips,
-			int n_correlators
-			);
-	bool set_local_code_and_taps(
-			int code_length_chips,
-			const std::complex<float>* local_codes_in,
-			float *shifts_chips,
-			int n_correlators
-			);
-	bool free_cuda();
-	bool Carrier_wipeoff_multicorrelator_cuda(
-			std::complex<float>* corr_out,
-			const std::complex<float>* sig_in,
-			const std::complex<float>* local_codes_in,
-			float rem_carrier_phase_in_rad,
-			float phase_step_rad,
-			const int *shifts_samples,
-			int signal_length_samples,
-			int n_correlators);
-	bool Carrier_wipeoff_multicorrelator_resampler_cuda(
-			std::complex<float>* corr_out,
-			const std::complex<float>* sig_in,
-			float rem_carrier_phase_in_rad,
-			float phase_step_rad,
-	        float code_phase_step_chips,
-	        float rem_code_phase_chips,
-			int signal_length_samples,
-			int n_correlators);
-private:
-	// Allocate the device input vectors
-	GPU_Complex *d_sig_in;
-	GPU_Complex *d_nco_in;
-	GPU_Complex *d_sig_doppler_wiped;
-	GPU_Complex *d_local_codes_in;
-	GPU_Complex *d_corr_out;
-	int *d_shifts_samples;
-	float *d_shifts_chips;
-	float d_code_length_chips;
+    cuda_multicorrelator();
+    //! Setup call that the GPU tracking block keeps commented out as "local code resampler on CPU (old)"
+    bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
+    //! Setup call that the GPU tracking block uses, passing argc = 0 and argv = NULL
+    bool init_cuda_integrated_resampler(
+            const int argc, const char **argv,
+            int signal_length_samples,
+            int code_length_chips,
+            int n_correlators);
+    bool set_local_code_and_taps(
+            int code_length_chips,
+            const std::complex<float>* local_codes_in,
+            float *shifts_chips,
+            int n_correlators);
+    //! Invoked from the GPU tracking block destructor right before this object is deleted
+    bool free_cuda();
+    bool Carrier_wipeoff_multicorrelator_cuda(
+            std::complex<float>* corr_out,
+            const std::complex<float>* sig_in,
+            const std::complex<float>* local_codes_in,
+            float rem_carrier_phase_in_rad,
+            float phase_step_rad,
+            const int *shifts_samples,
+            int signal_length_samples, int n_correlators);
+    //! Same leading arguments, but takes code phase step / remainder in chips instead of local codes and sample shifts
+    bool Carrier_wipeoff_multicorrelator_resampler_cuda(
+            std::complex<float>* corr_out,
+            const std::complex<float>* sig_in,
+            float rem_carrier_phase_in_rad,
+            float phase_step_rad,
+            float code_phase_step_chips,
+            float rem_code_phase_chips,
+            int signal_length_samples, int n_correlators);
 
-	int threadsPerBlock;
-	int blocksPerGrid;
+private:
+    // Allocate the device input vectors
+    GPU_Complex *d_sig_in;
+    GPU_Complex *d_nco_in;
+    GPU_Complex *d_sig_doppler_wiped;
+    GPU_Complex *d_local_codes_in;
+    GPU_Complex *d_corr_out;
+    int *d_shifts_samples;
+    float *d_shifts_chips;
+    float d_code_length_chips;
 
-	cudaStream_t stream1;
-	cudaStream_t stream2;
-	int num_gpu_devices;
-	int selected_device;
+    int threadsPerBlock;
+    int blocksPerGrid;
 
+    cudaStream_t stream1;
+    cudaStream_t stream2;
+    int num_gpu_devices;
+    int selected_device;
 };
 
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git