[hamradio-commits] [gnss-sdr] 14/22: Adding CPU and GPU multicorrelators unit tests

Thu Jun 30 20:11:29 UTC 2016

This is an automated email from the git hooks/post-receive script.

carles_fernandez-guest pushed a commit to branch next
in repository gnss-sdr.

commit c8d7a607f850a677df7032ed5087d191ce0003f1
Author: Javier Arribas <jarribas at cttc.es>
Date:   Fri Jun 17 12:48:31 2016 +0200

    Adding CPU and GPU multicorrelators unit tests
---
 .../tracking/libs/cuda_multicorrelator.cu          |   2 +-
 src/tests/CMakeLists.txt                           |  10 ++
 src/tests/arithmetic/cpu_multicorrelator_test.cc   | 167 +++++++++++++++++++++
 src/tests/arithmetic/gpu_multicorrelator_test.cc   | 166 ++++++++++++++++++++
 src/tests/test_main.cc                             |   6 +
 5 files changed, 350 insertions(+), 1 deletion(-)

diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
index 61dc305..78c3612 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -198,7 +198,7 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
 	//********* ZERO COPY VERSION ************
 	// Set flag to enable zero copy access
     // Optimal in shared memory devices (like Jetson K1)
-	cudaSetDeviceFlags(cudaDeviceMapHost);
+	//cudaSetDeviceFlags(cudaDeviceMapHost);
 
 	//******** CudaMalloc version ***********
 
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index ce8c40e..08ad88e 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -33,6 +33,7 @@ if(GTEST_INCLUDE_DIRS)
     set(GTEST_DIR_LOCAL true)
 endif(GTEST_INCLUDE_DIRS)
 
+
 if(NOT ${GTEST_DIR_LOCAL})
      # if GTEST_DIR is not defined, we download and build it
      set(gtest_RELEASE 1.7.0)
@@ -86,6 +87,11 @@ endif(NOT ${GTEST_DIR_LOCAL})
 set(GNSS_SDR_TEST_OPTIONAL_LIBS "")
 set(GNSS_SDR_TEST_OPTIONAL_HEADERS "")
 
+if(ENABLE_CUDA)  # append the CUDA include dirs and libraries to the optional test headers/libs lists
+    set(GNSS_SDR_TEST_OPTIONAL_HEADERS ${GNSS_SDR_TEST_OPTIONAL_HEADERS} ${CUDA_INCLUDE_DIRS})
+    set(GNSS_SDR_TEST_OPTIONAL_LIBS ${GNSS_SDR_TEST_OPTIONAL_LIBS} ${CUDA_LIBRARIES})
+endif(ENABLE_CUDA)
+
 if(ENABLE_GPERFTOOLS)
     if(GPERFTOOLS_FOUND)
         set(GNSS_SDR_TEST_OPTIONAL_LIBS "${GNSS_SDR_TEST_OPTIONAL_LIBS};${GPERFTOOLS_LIBRARIES}")
@@ -152,6 +158,10 @@ if(OPENCL_FOUND)
     add_definitions(-DOPENCL_BLOCKS_TEST=1)
 endif(OPENCL_FOUND)
 
+if (ENABLE_CUDA)  # CUDA_BLOCKS_TEST gates the inclusion of arithmetic/gpu_multicorrelator_test.cc in test_main.cc
+	add_definitions(-DCUDA_BLOCKS_TEST=1)
+endif(ENABLE_CUDA)
+
 add_definitions(-DTEST_PATH="${CMAKE_SOURCE_DIR}/src/tests/")
 
 
diff --git a/src/tests/arithmetic/cpu_multicorrelator_test.cc b/src/tests/arithmetic/cpu_multicorrelator_test.cc
new file mode 100644
index 0000000..d84b38f
--- /dev/null
+++ b/src/tests/arithmetic/cpu_multicorrelator_test.cc
@@ -0,0 +1,167 @@
+/*!
+ * \file cpu_multicorrelator_test.cc
+ * \brief  This file implements timing tests for the CPU multicorrelator.
+ * \author Carles Fernandez-Prades, 2016. cfernandez(at)cttc.es
+ *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2016  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include <ctime>
+#include <complex>
+#include <thread>
+#include <volk/volk.h>
+#include "cpu_multicorrelator.h"
+#include "gps_sdr_signal_processing.h"
+#include "GPS_L1_CA.h"
+
+
+DEFINE_int32(cpu_multicorrelator_iterations_test, 1000, "Number of averaged iterations in CPU multicorrelator test timing test");
+DEFINE_int32(cpu_multicorrelator_max_threads_test, 12, "Number of maximum concurrent correlators in CPU multicorrelator test timing test");
+
+void run_correlator_cpu(cpu_multicorrelator* correlator,  // thread body of the timing test: one resampler/correlation call on *correlator
+                    float d_rem_carrier_phase_rad,
+                    float d_carrier_phase_step_rad,
+                    float d_code_phase_step_chips,
+                    float d_rem_code_phase_chips,
+                    int correlation_size,
+                    int d_n_correlator_taps)  // accepted but not forwarded: the call below takes no tap count
+{
+    correlator->Carrier_wipeoff_multicorrelator_resampler(d_rem_carrier_phase_rad,
+                                                               d_carrier_phase_step_rad,
+                                                               d_code_phase_step_chips,
+                                                               d_rem_code_phase_chips,
+                                                               correlation_size);
+}
+
+TEST(CPU_multicorrelator_test, MeasureExecutionTime)  // times 1..max_threads concurrent CPU correlators for three correlation sizes
+{
+    struct timeval tv;  // NOTE(review): timeval/gettimeofday come from <sys/time.h>, which this file does not include directly
+    int max_threads=FLAGS_cpu_multicorrelator_max_threads_test;
+    std::vector<std::thread> thread_pool;
+    cpu_multicorrelator* correlator_pool[max_threads];  // sized at run time from the flag above (variable-length array, a compiler extension in C++)
+    unsigned int correlation_sizes [3] = { 2048, 4096, 8192};
+    double execution_times [3];  // one slot per correlation size
+
+    gr_complex* d_ca_code;
+    gr_complex* in_cpu;
+    gr_complex* d_correlator_outs;
+
+    int d_n_correlator_taps=3;
+    int d_vector_length=correlation_sizes[2]; //max correlation size to allocate all the necessary memory
+    float* d_local_code_shift_chips;
+
+    //allocate host memory (aligned with volk_malloc)
+    // Get space for a vector with the C/A code replica sampled 1x/chip
+    d_ca_code = static_cast<gr_complex*>(volk_malloc(static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));
+    in_cpu = static_cast<gr_complex*>(volk_malloc(2 * d_vector_length * sizeof(gr_complex), volk_get_alignment()));
+
+    // correlator outputs (scalar)
+    d_n_correlator_taps = 3; // Early, Prompt, and Late
+    d_correlator_outs = static_cast<gr_complex*>(volk_malloc(d_n_correlator_taps*sizeof(gr_complex), volk_get_alignment()));
+    for (int n = 0; n < d_n_correlator_taps; n++)
+    {
+        d_correlator_outs[n] = gr_complex(0,0);
+    }
+    d_local_code_shift_chips = static_cast<float*>(volk_malloc(d_n_correlator_taps*sizeof(float), volk_get_alignment()));
+    // Set TAPs delay values [chips]
+    float d_early_late_spc_chips=0.5;
+    d_local_code_shift_chips[0] = - d_early_late_spc_chips;
+    d_local_code_shift_chips[1] = 0.0;
+    d_local_code_shift_chips[2] = d_early_late_spc_chips;
+
+    //--- Perform initializations ------------------------------
+
+    // local code replica and input signal, both in host memory
+    // generate local reference (1 sample per chip)
+    gps_l1_ca_code_gen_complex(d_ca_code, 1, 0);
+    // generate input signal: random samples with real and imaginary parts in [0, 1]
+    for (int n=0;n<2*d_vector_length;n++)
+    {
+        in_cpu[n]=std::complex<float>(static_cast <float> (rand())/static_cast<float>(RAND_MAX),static_cast <float> (rand())/static_cast<float>(RAND_MAX));
+    }
+
+    for (int n=0;n<max_threads;n++)  // one correlator object per potential concurrent thread
+    {
+        correlator_pool[n] = new cpu_multicorrelator();
+        correlator_pool[n]->init(d_vector_length, d_n_correlator_taps);
+        correlator_pool[n]->set_input_output_vectors(d_correlator_outs, in_cpu);  // every correlator is given the same two buffers
+        correlator_pool[n]->set_local_code_and_taps(static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS), d_ca_code, d_local_code_shift_chips);
+    }
+
+    float d_rem_carrier_phase_rad=0.0;
+    float d_carrier_phase_step_rad=0.1;
+    float d_code_phase_step_chips=0.3;
+    float d_rem_code_phase_chips=0.4;
+
+    EXPECT_NO_THROW(
+        for(int correlation_sizes_idx = 0; correlation_sizes_idx < 3; correlation_sizes_idx++)
+        {
+            for(int current_max_threads=1; current_max_threads<(max_threads+1); current_max_threads++)
+            {
+                std::cout<<"Running "<<current_max_threads<<" concurrent correlators"<<std::endl;
+                gettimeofday(&tv, NULL);  // start of the timed region; thread creation and joining are inside it
+                long long int begin = tv.tv_sec * 1000000 + tv.tv_usec;  // microseconds
+                for(int k = 0; k < FLAGS_cpu_multicorrelator_iterations_test; k++)
+                {
+                    // create the concurrent correlator threads
+                    for (int current_thread=0;current_thread<current_max_threads;current_thread++)
+                    {
+                        thread_pool.push_back(std::thread(run_correlator_cpu,
+                                correlator_pool[current_thread],
+                                d_rem_carrier_phase_rad,
+                                d_carrier_phase_step_rad,
+                                d_code_phase_step_chips,
+                                d_rem_code_phase_chips,
+                                correlation_sizes[correlation_sizes_idx],
+                                d_n_correlator_taps));
+                    }
+                    // wait for the threads to finish their work and destroy the thread objects
+                    for(auto &t : thread_pool){
+                    t.join();
+                    }
+                    thread_pool.clear();
+                }
+                gettimeofday(&tv, NULL);
+                long long int end = tv.tv_sec * 1000000 + tv.tv_usec;
+                execution_times[correlation_sizes_idx] = static_cast<double>(end - begin) / (1000000.0 * static_cast<double>(FLAGS_cpu_multicorrelator_iterations_test));  // mean seconds per iteration; overwritten for each thread count
+                std::cout << "CPU Multicorrelator execution time for length=" << correlation_sizes[correlation_sizes_idx] << " : " << execution_times[correlation_sizes_idx] << " [s]" << std::endl;
+
+            }
+        }
+    );
+
+    // NOTE(review): these buffers are freed before the correlators that were given pointers to them — confirm free() does not touch them
+    volk_free(d_local_code_shift_chips);
+    volk_free(d_correlator_outs);
+    volk_free(d_ca_code);
+    volk_free(in_cpu);
+
+    for (int n=0;n<max_threads;n++)
+    {
+        correlator_pool[n]->free();
+        delete(correlator_pool[n]);
+    }
+}
diff --git a/src/tests/arithmetic/gpu_multicorrelator_test.cc b/src/tests/arithmetic/gpu_multicorrelator_test.cc
new file mode 100644
index 0000000..11c4fde
--- /dev/null
+++ b/src/tests/arithmetic/gpu_multicorrelator_test.cc
@@ -0,0 +1,166 @@
+/*!
+ * \file gpu_multicorrelator_test.cc
+ * \brief  This file implements timing tests for the GPU (CUDA) multicorrelator.
+ * \author Carles Fernandez-Prades, 2016. cfernandez(at)cttc.es
+ *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2016  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include <ctime>
+#include <complex>
+#include <thread>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "cuda_multicorrelator.h"
+#include "gps_sdr_signal_processing.h"
+#include "GPS_L1_CA.h"
+
+
+DEFINE_int32(gpu_multicorrelator_iterations_test, 1000, "Number of averaged iterations in GPU multicorrelator test timing test");
+DEFINE_int32(gpu_multicorrelator_max_threads_test, 12, "Number of maximum concurrent correlators in GPU multicorrelator test timing test");
+
+void run_correlator_gpu(cuda_multicorrelator* correlator,  // thread body of the timing test: one resampler/correlation call on *correlator
+                    float d_rem_carrier_phase_rad,
+                    float d_carrier_phase_step_rad,
+                    float d_code_phase_step_chips,
+                    float d_rem_code_phase_chips,
+                    int correlation_size,
+                    int d_n_correlator_taps)
+{
+    correlator->Carrier_wipeoff_multicorrelator_resampler_cuda(d_rem_carrier_phase_rad,  // all arguments are forwarded unchanged
+                                                                        d_carrier_phase_step_rad,
+                                                                        d_code_phase_step_chips,
+                                                                        d_rem_code_phase_chips,
+                                                                        correlation_size,
+                                                                        d_n_correlator_taps);
+}
+
+TEST(GPU_multicorrelator_test, MeasureExecutionTime)  // times 1..max_threads concurrent CUDA correlators for three correlation sizes
+{
+    struct timeval tv;  // NOTE(review): timeval/gettimeofday come from <sys/time.h>, which this file does not include directly
+    int max_threads=FLAGS_gpu_multicorrelator_max_threads_test;
+    std::vector<std::thread> thread_pool;
+    cuda_multicorrelator* correlator_pool[max_threads];  // sized at run time from the flag above (variable-length array, a compiler extension in C++)
+    unsigned int correlation_sizes [3] = { 2048, 4096, 8192};
+    double execution_times [3];  // one slot per correlation size
+
+    gr_complex* d_ca_code;
+    gr_complex* in_gpu;
+    gr_complex* d_correlator_outs;
+
+    int d_n_correlator_taps=3;
+    int d_vector_length=correlation_sizes[2]; //max correlation size to allocate all the necessary memory
+    float* d_local_code_shift_chips;
+    // Set GPU flags: request support for mapped host allocations (cudaHostAllocMapped is used below)
+    cudaSetDeviceFlags(cudaDeviceMapHost);  // NOTE(review): return code is not checked
+    //allocate host memory (NOTE(review): none of the cudaHostAlloc return codes below are checked)
+    //pinned memory mode - use special function to get OS-pinned memory
+    d_n_correlator_taps = 3; // Early, Prompt, and Late
+    // Get space for a vector with the C/A code replica sampled 1x/chip
+    cudaHostAlloc((void**)&d_ca_code, (static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS)* sizeof(gr_complex)), cudaHostAllocMapped | cudaHostAllocWriteCombined);
+    // Get space for the early / prompt / late tap delay values [chips], then for the input samples
+    cudaHostAlloc((void**)&d_local_code_shift_chips, d_n_correlator_taps * sizeof(float),  cudaHostAllocMapped | cudaHostAllocWriteCombined);
+    cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped | cudaHostAllocWriteCombined);
+    // correlator outputs (scalar)
+    cudaHostAlloc((void**)&d_correlator_outs ,sizeof(gr_complex)*d_n_correlator_taps, cudaHostAllocMapped |  cudaHostAllocWriteCombined );
+
+    //--- Perform initializations ------------------------------
+    //local code resampler on GPU
+    // generate local reference (1 sample per chip)
+    gps_l1_ca_code_gen_complex(d_ca_code, 1, 0);
+    // generate input signal: random samples with real and imaginary parts in [0, 1]
+    for (int n=0;n<2*d_vector_length;n++)
+    {
+        in_gpu[n]=std::complex<float>(static_cast <float> (rand())/static_cast<float>(RAND_MAX),static_cast <float> (rand())/static_cast<float>(RAND_MAX));
+    }
+    // Set TAPs delay values [chips]
+    float d_early_late_spc_chips=0.5;
+    d_local_code_shift_chips[0] = - d_early_late_spc_chips;
+    d_local_code_shift_chips[1] = 0.0;
+    d_local_code_shift_chips[2] = d_early_late_spc_chips;
+    for (int n=0;n<max_threads;n++)  // NOTE(review): d_ca_code and the tap delays are filled above but never passed to the correlators in this test
+    {
+        correlator_pool[n] = new cuda_multicorrelator();
+        correlator_pool[n]->init_cuda_integrated_resampler(d_vector_length, GPS_L1_CA_CODE_LENGTH_CHIPS, d_n_correlator_taps);
+        correlator_pool[n]->set_input_output_vectors(d_correlator_outs, in_gpu);  // every correlator is given the same two buffers
+    }
+
+    float d_rem_carrier_phase_rad=0.0;
+    float d_carrier_phase_step_rad=0.1;
+    float d_code_phase_step_chips=0.3;
+    float d_rem_code_phase_chips=0.4;
+
+    EXPECT_NO_THROW(
+        for(int correlation_sizes_idx = 0; correlation_sizes_idx < 3; correlation_sizes_idx++)
+            {
+                for(int current_max_threads=1; current_max_threads<(max_threads+1); current_max_threads++)
+                {
+                    std::cout<<"Running "<<current_max_threads<<" concurrent correlators"<<std::endl;
+                    gettimeofday(&tv, NULL);  // start of the timed region; thread creation and joining are inside it
+                    long long int begin = tv.tv_sec * 1000000 + tv.tv_usec;  // microseconds
+                    for(int k = 0; k < FLAGS_gpu_multicorrelator_iterations_test; k++)
+                    {
+                        // create the concurrent correlator threads
+                        for (int current_thread=0;current_thread<current_max_threads;current_thread++)
+                        {
+                            thread_pool.push_back(std::thread(run_correlator_gpu,
+                                    correlator_pool[current_thread],
+                                    d_rem_carrier_phase_rad,
+                                    d_carrier_phase_step_rad,
+                                    d_code_phase_step_chips,
+                                    d_rem_code_phase_chips,
+                                    correlation_sizes[correlation_sizes_idx],
+                                    d_n_correlator_taps));
+                        }
+                        // wait for the threads to finish their work and destroy the thread objects
+                        for(auto &t : thread_pool){
+                             t.join();
+                         }
+                        thread_pool.clear();
+                    }
+                    gettimeofday(&tv, NULL);
+                    long long int end = tv.tv_sec * 1000000 + tv.tv_usec;
+                    execution_times[correlation_sizes_idx] = static_cast<double>(end - begin) / (1000000.0 * static_cast<double>(FLAGS_gpu_multicorrelator_iterations_test));  // mean seconds per iteration; overwritten for each thread count
+                    std::cout << "GPU Multicorrelator execution time for length=" << correlation_sizes[correlation_sizes_idx] << " : " << execution_times[correlation_sizes_idx] << " [s]" << std::endl;
+
+                }
+    		}
+    );
+
+    cudaFreeHost(in_gpu);  // NOTE(review): host buffers are freed before the correlators that were given pointers to them — confirm free_cuda() does not touch them
+    cudaFreeHost(d_correlator_outs);
+    cudaFreeHost(d_local_code_shift_chips);
+    cudaFreeHost(d_ca_code);
+
+    for (int n=0;n<max_threads;n++)
+    {
+        correlator_pool[n]->free_cuda();
+        delete(correlator_pool[n]);
+    }
+
+
+
+}
diff --git a/src/tests/test_main.cc b/src/tests/test_main.cc
index a3dabce..5b3228f 100644
--- a/src/tests/test_main.cc
+++ b/src/tests/test_main.cc
@@ -92,9 +92,15 @@ DECLARE_string(log_dir);
 #include "gnss_block/gps_l2_m_pcps_acquisition_test.cc"
 #include "gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc"
 //#include "gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc"
+#include "arithmetic/cpu_multicorrelator_test.cc"
 #if OPENCL_BLOCKS_TEST
 #include "gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc"
 #endif
+
+#if CUDA_BLOCKS_TEST
+	#include "arithmetic/gpu_multicorrelator_test.cc"
+#endif
+
 #include "gnss_block/gps_l1_ca_pcps_quicksync_acquisition_gsoc2014_test.cc"
 #include "gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc"
 #include "gnss_block/galileo_e1_pcps_ambiguous_acquisition_test.cc"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git