[hamradio-commits] [gnss-sdr] 15/126: Pinned memory (Zero copy) huge improvement for GPU tracking.
Carles Fernandez
carles_fernandez-guest at moszumanska.debian.org
Sat Dec 26 18:37:56 UTC 2015
This is an automated email from the git hooks/post-receive script.
carles_fernandez-guest pushed a commit to branch next
in repository gnss-sdr.
commit 2039e998ffbf5319535fe116b55a0d84f455235f
Author: Javier Arribas <javiarribas at gmail.com>
Date: Thu Oct 15 19:09:09 2015 +0200
Pinned memory (Zero copy) huge improvement for GPU tracking.
---
conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf | 4 +-
.../gps_l1_ca_dll_pll_tracking_gpu_cc.h | 4 -
.../tracking/libs/cuda_multicorrelator.cu | 523 +++++++--------------
.../tracking/libs/cuda_multicorrelator.h | 24 +-
4 files changed, 170 insertions(+), 385 deletions(-)
diff --git a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
index 6bfc9bb..7d3c371 100644
--- a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
+++ b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
@@ -17,7 +17,7 @@ ControlThread.wait_for_flowgraph=false
SignalSource.implementation=File_Signal_Source
;#filename: path to file with the captured GNSS signal samples to be processed
-SignalSource.filename=/home/javier/signals/4msps.dat
+SignalSource.filename=/media/javier/SISTEMA/signals/New York/4msps.dat
;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
SignalSource.item_type=gr_complex
@@ -165,7 +165,7 @@ Resampler.sample_freq_out=4000000
;######### CHANNELS GLOBAL CONFIG ############
;#count: Number of available GPS satellite channels.
-Channels_GPS.count=1
+Channels_GPS.count=8
;#count: Number of available Galileo satellite channels.
Channels_Galileo.count=0
;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver
diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
index a3108f8..e632c48 100644
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
@@ -128,13 +128,9 @@ private:
//GPU HOST PINNED MEMORY IN/OUT VECTORS
gr_complex* in_gpu;
- gr_complex* d_carr_sign_gpu;
- gr_complex* d_local_codes_gpu;
float* d_local_code_shift_chips;
gr_complex* d_corr_outs_gpu;
cuda_multicorrelator *multicorrelator_gpu;
-
-
gr_complex* d_ca_code;
gr_complex *d_Early;
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
index 43ffeed..5017d14 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -32,26 +32,14 @@
* -------------------------------------------------------------------------
*/
-///////////////////////////////////////////////////////////////////////////////
-// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
-// (the same as for floating point multiplication and addition),
-// whereas full 32-bit multiplication takes 16 clocks per warp.
-// So if integer multiplication operands are guaranteed to fit into 24 bits
-// (always lie withtin [-8M, 8M - 1] range in signed case),
-// explicit 24-bit multiplication is preferred for performance.
-///////////////////////////////////////////////////////////////////////////////
-#define IMUL(a, b) __mul24(a, b)
-
#include "cuda_multicorrelator.h"
#include <stdio.h>
-
+#include <iostream>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
-
-#define ACCUM_N 256
-
+#define ACCUM_N 128
__global__ void scalarProdGPUCPXxN_shifts_chips(
GPU_Complex *d_corr_out,
@@ -90,15 +78,17 @@ __global__ void scalarProdGPUCPXxN_shifts_chips(
for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
{
+ //original sample code
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
+ //custom code for multitap correlator
// 1.resample local code for the current shift
float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
- //TODO: Take into account that in multitap correlators, the shifts can be negative!
+ //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
-
+ //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
// 2.correlate
sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
@@ -127,90 +117,57 @@ __global__ void scalarProdGPUCPXxN_shifts_chips(
}
}
-
-///////////////////////////////////////////////////////////////////////////////
-// Calculate scalar products of VectorN vectors of ElementN elements on GPU
-// Parameters restrictions:
-// 1) ElementN is strongly preferred to be a multiple of warp size to
-// meet alignment constraints of memory coalescing.
-// 2) ACCUM_N must be a power of two.
-///////////////////////////////////////////////////////////////////////////////
-
-
-__global__ void scalarProdGPUCPXxN_shifts(
- GPU_Complex *d_corr_out,
- GPU_Complex *d_sig_in,
- GPU_Complex *d_local_codes_in,
- int *d_shifts_samples,
- int vectorN,
- int elementN
-)
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
+ */
+__global__ void
+CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
{
- //Accumulators cache
- __shared__ GPU_Complex accumResult[ACCUM_N];
-
- ////////////////////////////////////////////////////////////////////////////
- // Cycle through every pair of vectors,
- // taking into account that vector counts can be different
- // from total number of thread blocks
- ////////////////////////////////////////////////////////////////////////////
- for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+ // CUDA version of floating point NCO and vector dot product integrated
+ float sin;
+ float cos;
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+ i < numElements;
+ i += blockDim.x * gridDim.x)
{
- int vectorBase = IMUL(elementN, vec);
- int vectorEnd = vectorBase + elementN;
-
- ////////////////////////////////////////////////////////////////////////
- // Each accumulator cycles through vectors with
- // stride equal to number of total number of accumulators ACCUM_N
- // At this stage ACCUM_N is only preferred be a multiple of warp size
- // to meet memory coalescing alignment constraints.
- ////////////////////////////////////////////////////////////////////////
- for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
- {
- GPU_Complex sum = GPU_Complex(0,0);
-
- for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
- {
- //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
- //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
- sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]);
- }
- accumResult[iAccum] = sum;
- }
-
- ////////////////////////////////////////////////////////////////////////
- // Perform tree-like reduction of accumulators' results.
- // ACCUM_N has to be power of two at this stage
- ////////////////////////////////////////////////////////////////////////
- for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
- {
- __syncthreads();
-
- for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
- {
- accumResult[iAccum] += accumResult[stride + iAccum];
- }
- }
-
- if (threadIdx.x == 0)
- {
- d_corr_out[vec] = accumResult[0];
- }
+ __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
+ sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin);
}
}
-__global__ void scalarProdGPUCPXxN(
+__global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
GPU_Complex *d_corr_out,
GPU_Complex *d_sig_in,
- GPU_Complex *d_local_codes_in,
+ GPU_Complex *d_sig_wiped,
+ GPU_Complex *d_local_code_in,
+ float *d_shifts_chips,
+ float code_length_chips,
+ float code_phase_step_chips,
+ float rem_code_phase_chips,
int vectorN,
- int elementN
+ int elementN,
+ float rem_carrier_phase_in_rad,
+ float phase_step_rad
)
{
//Accumulators cache
__shared__ GPU_Complex accumResult[ACCUM_N];
+ // CUDA version of floating point NCO and vector dot product integrated
+ float sin;
+ float cos;
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+ i < elementN;
+ i += blockDim.x * gridDim.x)
+ {
+ __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
+ d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos,-sin);
+ }
+
+ __syncthreads();
////////////////////////////////////////////////////////////////////////////
// Cycle through every pair of vectors,
// taking into account that vector counts can be different
@@ -219,7 +176,7 @@ __global__ void scalarProdGPUCPXxN(
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
{
//int vectorBase = IMUL(elementN, vec);
- //int vectorEnd = vectorBase + elementN;
+ //int vectorEnd = elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
@@ -230,14 +187,26 @@ __global__ void scalarProdGPUCPXxN(
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
{
GPU_Complex sum = GPU_Complex(0,0);
-
- //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
- for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
+ float local_code_chip_index;
+ //float code_phase;
+ for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
{
+ //original sample code
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
- //sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
- sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]);
+ //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
+
+ //custom code for multitap correlator
+ // 1.resample local code for the current shift
+
+ local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
+
+ //Take into account that in multitap correlators, the shifts can be negative!
+ if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
+ //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
+ // 2.correlate
+ sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
+
}
accumResult[iAccum] = sum;
}
@@ -263,166 +232,7 @@ __global__ void scalarProdGPUCPXxN(
}
}
-
-//*********** CUDA processing **************
-// Treads: a minimal parallel execution code on GPU
-// Blocks: a set of N threads
-/**
- * CUDA Kernel Device code
- *
- * Computes the vectorial product of A and B into C. The 3 vectors have the same
- * number of elements numElements.
- */
-__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
-{
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;
- i < numElements;
- i += blockDim.x * gridDim.x)
- {
- C[i] = A[i] * B[i];
- }
-}
-
-
-/**
- * CUDA Kernel Device code
- *
- * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
- */
-__global__ void
-CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
-{
- //*** NCO CPU code (GNURadio FXP NCO)
- //float sin_f, cos_f;
- //float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
- //int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad);
- //int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad);
- //
- //for(int i = 0; i < d_current_prn_length_samples; i++)
- // {
- // gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f);
- // d_carr_sign[i] = std::complex<float>(cos_f, -sin_f);
- // phase_rad_i += phase_step_rad_i;
- // }
-
- // CUDA version of floating point NCO and vector dot product integrated
-
- float sin;
- float cos;
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;
- i < numElements;
- i += blockDim.x * gridDim.x)
- {
- __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
- sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin);
- }
-}
-
-
-/**
- * CUDA Kernel Device code
- *
- * Computes the vectorial product of A and B into C. The 3 vectors have the same
- * number of elements numElements.
- */
-__global__ void
-CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
-{
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;
- i < numElements;
- i += blockDim.x * gridDim.x)
- {
- C[i] = A[i] + B[i];
- }
-}
-
-
-bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators)
-{
- // use command-line specified CUDA device, otherwise use device with highest Gflops/s
-// findCudaDevice(argc, (const char **)argv);
-// cudaDeviceProp prop;
-// int num_devices, device;
-// cudaGetDeviceCount(&num_devices);
-// if (num_devices > 1) {
-// int max_multiprocessors = 0, max_device = 0;
-// for (device = 0; device < num_devices; device++) {
-// cudaDeviceProp properties;
-// cudaGetDeviceProperties(&properties, device);
-// if (max_multiprocessors < properties.multiProcessorCount) {
-// max_multiprocessors = properties.multiProcessorCount;
-// max_device = device;
-// }
-// printf("Found GPU device # %i\n",device);
-// }
-// //cudaSetDevice(max_device);
-//
-// //set random device!
-// cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
-//
-// cudaGetDeviceProperties( &prop, max_device );
-// //debug code
-// if (prop.canMapHostMemory != 1) {
-// printf( "Device can not map memory.\n" );
-// }
-// printf("L2 Cache size= %u \n",prop.l2CacheSize);
-// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
-// printf("maxGridSize= %i \n",prop.maxGridSize[0]);
-// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
-// printf("deviceOverlap= %i \n",prop.deviceOverlap);
-// printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
-// }else{
-// int whichDevice;
-// cudaGetDevice( &whichDevice );
-// cudaGetDeviceProperties( &prop, whichDevice );
-// //debug code
-// if (prop.canMapHostMemory != 1) {
-// printf( "Device can not map memory.\n" );
-// }
-//
-// printf("L2 Cache size= %u \n",prop.l2CacheSize);
-// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
-// printf("maxGridSize= %i \n",prop.maxGridSize[0]);
-// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
-// printf("deviceOverlap= %i \n",prop.deviceOverlap);
-// printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
-// }
-
- // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
-
-
- // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
-
- size_t size = signal_length_samples * sizeof(GPU_Complex);
-
- cudaMalloc((void **)&d_sig_in, size);
- // (cudaMalloc((void **)&d_nco_in, size));
- cudaMalloc((void **)&d_sig_doppler_wiped, size);
-
- // old version: all local codes are independent vectors
- // (cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
-
- // new version: only one vector with extra samples to shift the local code for the correlator set
- // Required: The last correlator tap in d_shifts_samples has the largest sample shift
- size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
- cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes);
- cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators);
-
- //scalars
- cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
-
- // Launch the Vector Add CUDA Kernel
- threadsPerBlock = 256;
- blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
-
- cudaStreamCreate (&stream1) ;
- cudaStreamCreate (&stream2) ;
- return true;
-}
-
-
bool cuda_multicorrelator::init_cuda_integrated_resampler(
- const int argc, const char **argv,
int signal_length_samples,
int code_length_chips,
int n_correlators
@@ -480,34 +290,45 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
// ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
-
size_t size = signal_length_samples * sizeof(GPU_Complex);
- cudaMalloc((void **)&d_sig_in, size);
- cudaMemset(d_sig_in,0,size);
+ //********* ZERO COPY VERSION ************
+ // Set flag to enable zero copy access
+ // Optimal in shared memory devices (like Jetson K1)
+ cudaSetDeviceFlags(cudaDeviceMapHost);
+
+ //******** CudaMalloc version ***********
+
+ // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
+ // cudaMalloc((void **)&d_sig_in, size);
+ // cudaMemset(d_sig_in,0,size);
- // (cudaMalloc((void **)&d_nco_in, size));
+ // Doppler-free signal (internal GPU memory)
cudaMalloc((void **)&d_sig_doppler_wiped, size);
cudaMemset(d_sig_doppler_wiped,0,size);
+ // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips);
cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips);
d_code_length_chips=code_length_chips;
+ // Vector with the chip shifts for each correlator tap
+ //GPU memory (can be mapped to CPU memory in shared memory devices!)
cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);
//scalars
- cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
- cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
+ //cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
+ //cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
// Launch the Vector Add CUDA Kernel
- threadsPerBlock = 256;
+ // TODO: write a smart load balance using device info!
+ threadsPerBlock = 64;
blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
cudaStreamCreate (&stream1) ;
- cudaStreamCreate (&stream2) ;
+ //cudaStreamCreate (&stream2) ;
return true;
}
@@ -518,103 +339,57 @@ bool cuda_multicorrelator::set_local_code_and_taps(
int n_correlators
)
{
- // local code CPU -> GPU copy memory
+ //********* ZERO COPY VERSION ************
+// // Get device pointer from host memory. No allocation or memcpy
+// cudaError_t code;
+// // local code CPU -> GPU copy memory
+// code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0);
+// if (code!=cudaSuccess)
+// {
+// printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
+// }
+// // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
+// code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0);
+// if (code!=cudaSuccess)
+// {
+// printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
+// }
+
+ //******** CudaMalloc version ***********
+ //local code CPU -> GPU copy memory
cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
d_code_length_chips=(float)code_length_chips;
- // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
+ //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
cudaMemcpyHostToDevice,stream1);
return true;
}
-
-
-bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
+bool cuda_multicorrelator::set_input_output_vectors(
std::complex<float>* corr_out,
- const std::complex<float>* sig_in,
- const std::complex<float>* local_codes_in,
- float rem_carrier_phase_in_rad,
- float phase_step_rad,
- const int *shifts_samples,
- int signal_length_samples,
- int n_correlators)
- {
-
- size_t memSize = signal_length_samples * sizeof(std::complex<float>);
-
- // input signal CPU -> GPU copy memory
-
- cudaMemcpyAsync(d_sig_in, sig_in, memSize,
- cudaMemcpyHostToDevice, stream1);
-
- //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
- // (cudaMemcpyAsync(d_nco_in, nco_in, memSize,
- // cudaMemcpyHostToDevice, stream1));
-
-
- // old version: all local codes are independent vectors
- // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
- // cudaMemcpyHostToDevice, stream2));
-
- // new version: only one vector with extra samples to shift the local code for the correlator set
- // Required: The last correlator tap in d_shifts_samples has the largest sample shift
-
- // local code CPU -> GPU copy memory
- cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
- cudaMemcpyHostToDevice, stream2);
- // Correlator shifts vector CPU -> GPU copy memory
- cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
- cudaMemcpyHostToDevice, stream2);
-
-
- //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
- cudaStreamSynchronize(stream1);
- CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
-
+ std::complex<float>* sig_in
+ )
+{
- //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
+ // Save CPU pointers
+ d_sig_in_cpu =sig_in;
+ d_corr_out_cpu = corr_out;
- //wait for Doppler wipeoff end...
- cudaStreamSynchronize(stream1);
- cudaStreamSynchronize(stream2);
- // (cudaDeviceSynchronize());
-
- //old
-// scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
-// d_corr_out,
-// d_sig_doppler_wiped,
-// d_local_codes_in,
-// 3,
-// signal_length_samples
-// );
-
- //new
- //launch the multitap correlator
- scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
- d_corr_out,
- d_sig_doppler_wiped,
- d_local_codes_in,
- d_shifts_samples,
- n_correlators,
- signal_length_samples
- );
- cudaGetLastError();
- //wait for correlators end...
- cudaStreamSynchronize(stream2);
- // Copy the device result vector in device memory to the host result vector
- // in host memory.
+ // Zero Copy version
+ // Get device pointer from host memory. No allocation or memcpy
+ cudaError_t code;
+ code=cudaHostGetDevicePointer((void **)&d_sig_in, (void *) sig_in, 0);
+ code=cudaHostGetDevicePointer((void **)&d_corr_out, (void *) corr_out, 0);
+ if (code!=cudaSuccess)
+ {
+ printf("cuda cudaHostGetDevicePointer error \r\n");
+ }
+ return true;
- //scalar products (correlators outputs)
- cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
- cudaMemcpyDeviceToHost);
- return true;
}
-
bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
- std::complex<float>* corr_out,
- const std::complex<float>* sig_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
float code_phase_step_chips,
@@ -623,26 +398,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
int n_correlators)
{
- size_t memSize = signal_length_samples * sizeof(std::complex<float>);
+
+ // cudaMemCpy version
+ //size_t memSize = signal_length_samples * sizeof(std::complex<float>);
// input signal CPU -> GPU copy memory
- cudaMemcpyAsync(d_sig_in, sig_in, memSize,
- cudaMemcpyHostToDevice, stream2);
+ //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
+ // cudaMemcpyHostToDevice, stream2);
//***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
-
//Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
- cudaStreamSynchronize(stream2);
+ //cudaStreamSynchronize(stream2);
- CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
+ //CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
//wait for Doppler wipeoff end...
- cudaStreamSynchronize(stream1);
- cudaStreamSynchronize(stream2);
+ //cudaStreamSynchronize(stream1);
+ //cudaStreamSynchronize(stream2);
//launch the multitap correlator with integrated local code resampler!
- scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
+// scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
+// d_corr_out,
+// d_sig_doppler_wiped,
+// d_local_codes_in,
+// d_shifts_chips,
+// d_code_length_chips,
+// code_phase_step_chips,
+// rem_code_phase_chips,
+// n_correlators,
+// signal_length_samples
+// );
+
+ Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
d_corr_out,
+ d_sig_in,
d_sig_doppler_wiped,
d_local_codes_in,
d_shifts_chips,
@@ -650,23 +439,33 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
code_phase_step_chips,
rem_code_phase_chips,
n_correlators,
- signal_length_samples
- );
-
- cudaGetLastError();
+ signal_length_samples,
+ rem_carrier_phase_in_rad,
+ phase_step_rad
+ );
+
+ //debug
+// std::complex<float>* debug_signal;
+// debug_signal=static_cast<std::complex<float>*>(malloc(memSize));
+// cudaMemcpyAsync(debug_signal, d_sig_doppler_wiped, memSize,
+// cudaMemcpyDeviceToHost,stream1);
+// cudaStreamSynchronize(stream1);
+// std::cout<<"d_sig_doppler_wiped GPU="<<debug_signal[456]<<","<<debug_signal[1]<<","<<debug_signal[2]<<","<<debug_signal[3]<<std::endl;
+
+ //cudaGetLastError();
//wait for correlators end...
- cudaStreamSynchronize(stream1);
+ //cudaStreamSynchronize(stream1);
// Copy the device result vector in device memory to the host result vector
// in host memory.
//scalar products (correlators outputs)
- cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
- cudaMemcpyDeviceToHost,stream1);
+ //cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+ // cudaMemcpyDeviceToHost,stream1);
+
cudaStreamSynchronize(stream1);
return true;
}
-
cuda_multicorrelator::cuda_multicorrelator()
{
d_sig_in=NULL;
@@ -689,22 +488,16 @@ bool cuda_multicorrelator::free_cuda()
if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped);
if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in);
if (d_corr_out!=NULL) cudaFree(d_corr_out);
-
-
if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
-
- cudaStreamDestroy(stream1) ;
- cudaStreamDestroy(stream2) ;
-
// Reset the device and exit
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
- // (cudaDeviceReset());
+ cudaDeviceReset();
return true;
}
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
index df640f5..fb2c9a9 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -114,9 +114,7 @@ class cuda_multicorrelator
{
public:
cuda_multicorrelator();
- bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
bool init_cuda_integrated_resampler(
- const int argc, const char **argv,
int signal_length_samples,
int code_length_chips,
int n_correlators
@@ -127,19 +125,12 @@ public:
float *shifts_chips,
int n_correlators
);
+ bool set_input_output_vectors(
+ std::complex<float>* corr_out,
+ std::complex<float>* sig_in
+ );
bool free_cuda();
- bool Carrier_wipeoff_multicorrelator_cuda(
- std::complex<float>* corr_out,
- const std::complex<float>* sig_in,
- const std::complex<float>* local_codes_in,
- float rem_carrier_phase_in_rad,
- float phase_step_rad,
- const int *shifts_samples,
- int signal_length_samples,
- int n_correlators);
bool Carrier_wipeoff_multicorrelator_resampler_cuda(
- std::complex<float>* corr_out,
- const std::complex<float>* sig_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
float code_phase_step_chips,
@@ -154,6 +145,11 @@ private:
GPU_Complex *d_sig_doppler_wiped;
GPU_Complex *d_local_codes_in;
GPU_Complex *d_corr_out;
+
+ //
+ std::complex<float> *d_sig_in_cpu;
+ std::complex<float> *d_corr_out_cpu;
+
int *d_shifts_samples;
float *d_shifts_chips;
float d_code_length_chips;
@@ -162,7 +158,7 @@ private:
int blocksPerGrid;
cudaStream_t stream1;
- cudaStream_t stream2;
+ //cudaStream_t stream2;
int num_gpu_devices;
int selected_device;
};
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git
More information about the pkg-hamradio-commits
mailing list