[hamradio-commits] [gnss-sdr] 51/149: introducing new kernels

Carles Fernandez carles_fernandez-guest at moszumanska.debian.org
Sat Feb 6 19:43:02 UTC 2016


This is an automated email from the git hooks/post-receive script.

carles_fernandez-guest pushed a commit to branch next
in repository gnss-sdr.

commit eb1dbfe37b66951dd1f0d1d18cebac55e4771874
Author: Carles Fernandez <carles.fernandez at gmail.com>
Date:   Tue Jan 12 22:48:59 2016 +0100

    introducing new kernels
---
 .../include/volk_gnsssdr}/saturated_arithmetic.h   |   0
 .../volk_gnsssdr_16ic_resampler_16ic.h             | 170 ++++++++++++++++++++
 .../volk_gnsssdr_16ic_x2_dot_prod_16ic.h           | 152 ++++++++++++++++++
 .../volk_gnsssdr/lib/kernel_tests.h                |   2 +
 .../volk_gnsssdr/lib/qa_utils.h                    |   8 +
 .../tracking/libs/cpu_multicorrelator_16sc.cc      |   7 +-
 .../libs/volk_gnsssdr_16ic_resampler_16ic.h        | 171 ---------------------
 .../libs/volk_gnsssdr_16ic_x2_dot_prod_16ic.h      | 152 ------------------
 .../libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h   | 152 +++++++++---------
 9 files changed, 409 insertions(+), 405 deletions(-)

diff --git a/src/algorithms/tracking/libs/saturated_arithmetic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturated_arithmetic.h
similarity index 100%
rename from src/algorithms/tracking/libs/saturated_arithmetic.h
rename to src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturated_arithmetic.h
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h
new file mode 100644
index 0000000..6232771
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h
@@ -0,0 +1,170 @@
+/*!
+ * \file volk_gnsssdr_16ic_resampler_16ic.h
+ * \brief Volk protokernel: resample a 16-bit complex vector
+ * \authors <ul>
+ *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ *          </ul>
+ *
+ * Volk protokernel that resamples a 16-bit complex vector (16-bit integer real
+ * and imaginary parts) at a given code phase step, wrapping around the code length
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
+#define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
+
+#include <math.h>
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <volk_gnsssdr/saturated_arithmetic.h>
+
+//#pragma STDC FENV_ACCESS ON
+
+#ifdef LV_HAVE_GENERIC
+
+//int round_int( float r ) {
+//    return (r > 0.0) ? (r + 0.5) : (r - 0.5);
+//}
+/*!
+ \brief Resamples the local code vector at the given code phase step, wrapping the read index around the code length
+ \param result The vector where the resampled code will be stored
+ \param local_code The local code vector to be resampled
+ \param rem_code_phase_chips Remaining code phase, in chips, at the first output sample
+ \param code_phase_step_chips Code phase increment, in chips, between consecutive output samples
+ \param num_output_samples The number of samples to be written into result
+ \param code_length_chips The length of local_code, in chips
+ */
+static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)
+{
+    int local_code_chip_index;
+    //fesetround(FE_TONEAREST);
+    for (unsigned int n = 0; n < num_output_samples; n++)
+        {
+            // resample code for current tap
+            local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);
+            if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
+            if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips;
+            //std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" ";
+            result[n] = local_code[local_code_chip_index];
+        }
+    //std::cout<<std::endl;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)//, int* scratch_buffer, float* scratch_buffer_float)
+{
+    _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+    unsigned int number;
+    const unsigned int quarterPoints = num_output_samples / 4;
+
+    lv_16sc_t* _result = result;
+
+    __attribute__((aligned(16))) int local_code_chip_index[4];
+    __m128 _rem_code_phase, _code_phase_step_chips;
+    __m128i _code_length_chips, _code_length_chips_minus1;
+    __m128 _code_phase_out, _code_phase_out_with_offset;
+    rem_code_phase_chips = rem_code_phase_chips - 0.5f;
+
+    _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //broadcast the float to all four lanes of the m128 register
+    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //broadcast the float to all four lanes of the m128 register
+    __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
+    four_times_code_length_chips_minus1[0] = code_length_chips-1;
+    four_times_code_length_chips_minus1[1] = code_length_chips-1;
+    four_times_code_length_chips_minus1[2] = code_length_chips-1;
+    four_times_code_length_chips_minus1[3] = code_length_chips-1;
+
+    __attribute__((aligned(16))) int four_times_code_length_chips[4];
+    four_times_code_length_chips[0] = code_length_chips;
+    four_times_code_length_chips[1] = code_length_chips;
+    four_times_code_length_chips[2] = code_length_chips;
+    four_times_code_length_chips[3] = code_length_chips;
+
+    _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load the four 32-bit integers into the m128i register
+    _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load the four 32-bit integers into the m128i register
+
+    __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
+
+    __m128i zero = _mm_setzero_si128();
+
+    __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+    __m128 _4output_index = _mm_load_ps(init_idx_float);
+    __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
+    __m128 _4constant_float = _mm_load_ps(init_4constant_float);
+
+    //__attribute__((aligned(16))) int output_indexes[4];
+
+    for(number = 0; number < quarterPoints; number++)
+        {
+            _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
+            _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
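+            // _MM_ROUND_NEAREST was set above, so the float-to-int conversion below rounds to the nearest integer (the generic kernel uses round() for the same purpose)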
+            _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
+
+            negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
+            _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
+            //_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int));
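+            // branchless select: where negative_indexes is set, take the wrapped value (index + code_length); elsewhere keep the original index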
+            _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
+
+            overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
+            _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the overflow values branch
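+            // branchless select: where overflow_indexes is set, take the wrapped value (index - code_length); elsewhere keep the index unchanged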
+            _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
+
+            _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
+
+            //_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset);
+
+            //todo: optimize the local code lookup table with intrinsics, if possible
+            *_result++ = local_code[local_code_chip_index[0]];
+            *_result++ = local_code[local_code_chip_index[1]];
+            *_result++ = local_code[local_code_chip_index[2]];
+            *_result++ = local_code[local_code_chip_index[3]];
+
+            _4output_index = _mm_add_ps(_4output_index, _4constant_float);
+            //_scratch_buffer_float+=4;
+
+        }
+
+    for(number = quarterPoints * 4; number < num_output_samples; number++)
+        {
+            local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
+            if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
+            if (local_code_chip_index[0] > (code_length_chips - 1)) local_code_chip_index[0] -= code_length_chips;
+            *_result++ = local_code[local_code_chip_index[0]];
+            //*_scratch_buffer_float++=code_phase_step_chips*static_cast<float>(number)+rem_code_phase_chips;
+        }
+
+    //    for(unsigned int n=0;n<num_output_samples;n++)
+    //    {
+    //
+    //		std::cout<<"s["<<n<<"]="<<scratch_buffer_float[n]<<","<<scratch_buffer[n]<<" ";
+    //    }
+    //    std::cout<<std::endl;
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H*/
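A minimal, hypothetical usage sketch of the generic protokernel above (not part of this commit): it assumes the header is on the include path and defines LV_HAVE_GENERIC by hand, as cpu_multicorrelator_16sc.cc does further down; file name, buffer sizes and phase values are illustrative only.

    // resampler_sketch.cc  (hypothetical example)
    #define LV_HAVE_GENERIC
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>   // lv_16sc_t, lv_cmake
    #include "volk_gnsssdr_16ic_resampler_16ic.h"
    #include <cstdint>
    #include <vector>

    int main()
    {
        const unsigned int code_length_chips = 1023;     // e.g. a GPS L1 C/A code
        const unsigned int num_output_samples = 4000;
        std::vector<lv_16sc_t> local_code(code_length_chips, lv_cmake((int16_t)1, (int16_t)0));
        std::vector<lv_16sc_t> resampled(num_output_samples);

        float rem_code_phase_chips = 0.25f;    // residual code phase at the first output sample
        float code_phase_step_chips = 0.2557f; // chips advanced per output sample

        // writes local_code[round(step*n + rem - 0.5)] into resampled[n], wrapping at code_length_chips
        volk_gnsssdr_16ic_resampler_16ic_generic(resampled.data(), local_code.data(),
                rem_code_phase_chips, code_phase_step_chips,
                num_output_samples, code_length_chips);
        return 0;
    }
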
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
new file mode 100644
index 0000000..aac703a
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
@@ -0,0 +1,152 @@
+/*!
+ * \file volk_gnsssdr_16ic_x2_dot_prod_16ic.h
+ * \brief Volk protokernel: multiplies two 16-bit complex vectors and accumulates the result
+ * \authors <ul>
+ *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ *          </ul>
+ *
+ * Volk protokernel that multiplies two 16-bit complex vectors (16-bit integer
+ * real and imaginary parts) and accumulates the result with saturated arithmetic
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
+#define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <volk_gnsssdr/saturated_arithmetic.h>
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input complex vectors element by element and accumulates the products with 16-bit saturation, storing the result in the first argument
+ \param result Pointer to the single complex value where the accumulated dot product will be stored
+ \param in_a One of the vectors to be multiplied and accumulated
+ \param in_b One of the vectors to be multiplied and accumulated
+ \param num_points The number of complex values in in_a and in_b to be multiplied together and accumulated into result
+ */
+static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+    result[0] = lv_cmake((int16_t)0, (int16_t)0);
+    for (unsigned int n = 0; n < num_points; n++)
+        {
+            //r*a.r - i*a.i, i*a.r + r*a.i
+            //result[0]+=in_a[n]*in_b[n];
+            lv_16sc_t tmp = in_a[n] * in_b[n];
+            result[0] = lv_cmake(sat_adds16b(lv_creal(result[0]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[0]), lv_cimag(tmp) ));
+            //result[0].real(sat_adds16b(result[0].real(),lv_creal(tmp)));
+            //result[0].imag(sat_adds16b(result[0].imag(),tmp.imag()));
+        }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
+{
+
+    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+    const unsigned int sse_iters = num_points / 4;
+
+    const lv_16sc_t* _in_a = in_a;
+    const lv_16sc_t* _in_b = in_b;
+    lv_16sc_t* _out = out;
+
+    if (sse_iters > 0)
+        {
+            __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result;
+
+            realcacc = _mm_setzero_si128();
+            imagcacc = _mm_setzero_si128();
+
+            mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+            mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+            for(unsigned int number = 0; number < sse_iters; number++)
+                {
+                    //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
+                    //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
+                    // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
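+                    // complex product, lane by lane: real = a.r*b.r - a.i*b.i lands in the even 16-bit lanes, imag = a.i*b.r + a.r*b.i in the odd lanes; the shifts, saturated add/sub and masks below assemble them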
+                    a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+                    b = _mm_loadu_si128((__m128i*)_in_b);
+                    c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+                    c_sr = _mm_srli_si128 (c, 2); // shift c right by 2 bytes (one 16-bit lane), shifting in zeros
+                    real = _mm_subs_epi16 (c,c_sr);
+
+                    b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+                    a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+                    imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+                    imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+                    imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
+
+                    realcacc = _mm_adds_epi16 (realcacc, real);
+                    imagcacc = _mm_adds_epi16 (imagcacc, imag);
+
+                    _in_a += 4;
+                    _in_b += 4;
+                }
+
+            realcacc = _mm_and_si128 (realcacc, mask_real);
+            imagcacc = _mm_and_si128 (imagcacc, mask_imag);
+
+            result = _mm_or_si128 (realcacc, imagcacc);
+
+            __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+            _mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
+
+            for (int i = 0; i < 4; ++i)
+                {
+                    dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
+                    //dotProduct.real(sat_adds16b(lv_creal(dotProduct),lv_creal(dotProductVector[i])));
+                    //dotProduct.imag(sat_adds16b(lv_cimag(dotProduct),lv_cimag(dotProductVector[i])));
+                }
+        }
+
+    for (unsigned int i = 0; i < (num_points % 4); ++i)
+        {
+            //dotProduct += (*_in_a++) * (*_in_b++);
+            lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+            dotProduct = lv_cmake( sat_adds16b(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16b(lv_cimag(dotProduct), lv_cimag(tmp)));
+            //dotProduct.real(sat_adds16b(lv_creal(dotProduct),lv_creal(tmp)));
+            //dotProduct.imag(sat_adds16b(lv_cimag(dotProduct),lv_cimag(tmp)));
+        }
+
+    *_out = dotProduct;
+}
+#endif /* LV_HAVE_SSE2 */
+
+#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/
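A similar hypothetical sketch for the generic dot-product protokernel above (not part of this commit); the accumulation saturates at the int16_t limits through sat_adds16b(). Values are illustrative.

    // dot_prod_sketch.cc  (hypothetical example)
    #define LV_HAVE_GENERIC
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>   // lv_16sc_t, lv_cmake
    #include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h"
    #include <cstdint>
    #include <vector>

    int main()
    {
        const unsigned int num_points = 4000;
        std::vector<lv_16sc_t> in_a(num_points, lv_cmake((int16_t)1, (int16_t)0));
        std::vector<lv_16sc_t> in_b(num_points, lv_cmake((int16_t)0, (int16_t)1));

        lv_16sc_t acc;
        volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(&acc, in_a.data(), in_b.data(), num_points);
        // acc = sum over n of in_a[n] * in_b[n], each partial sum clamped to the int16_t range
        return 0;
    }

In production code one would normally call the dispatched entry point (volk_gnsssdr_16ic_x2_dot_prod_16ic, assuming the module's code generator picks the kernel up under the usual VOLK naming convention), which selects the fastest available implementation at runtime; that is what including volk_gnsssdr/volk_gnsssdr.h in cpu_multicorrelator_16sc.cc below is for.
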
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h
index 7efaea2..0665961 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h
@@ -79,6 +79,8 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
         (VOLK_INIT_TEST(volk_gnsssdr_64f_accumulator_64f, test_params))
         (VOLK_INIT_TEST(volk_gnsssdr_32fc_convert_8ic, test_params))
         (VOLK_INIT_TEST(volk_gnsssdr_32fc_convert_16ic, test_params))
+        (VOLK_INIT_TEST(volk_gnsssdr_16ic_x2_dot_prod_16ic, test_params))
+        (VOLK_INIT_TEST(volk_gnsssdr_16ic_resampler_16ic, test_params))
         ;
 
     return test_cases;
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h
index 3163129..971d1c1 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h
@@ -170,6 +170,14 @@ typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const
 typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
 typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
 
+//typedef void (*volk_gnsssdr_fn_1arg_s16i)(void *, int16_t, unsigned int, const char*); //one input vector, one scalar int16_t input
+//typedef void (*volk_gnsssdr_fn_2arg_s16i)(void *, void *, int16_t, unsigned int, const char*);
+//typedef void (*volk_gnsssdr_fn_3arg_s16i)(void *, void *, void *, int16_t, unsigned int, const char*);
+//typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input
+//typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*);
+//typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*);
+typedef void (*volk_gnsssdr_fn_6arg_s16ic)(void *, void *, void *, void *, void *, void *, lv_16sc_t, unsigned int, const char*);
+
 typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
 typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
 typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
index 572ec32..e1cb2c2 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
@@ -36,11 +36,14 @@
 #include <iostream>
 #include <gnuradio/fxpt.h>  // fixed point sine and cosine
 
+#include "volk_gnsssdr/volk_gnsssdr.h"
+
 #define LV_HAVE_GENERIC
 #define LV_HAVE_SSE2
-#include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h"
+
+//#include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h"
 #include "volk_gnsssdr_16ic_x2_multiply_16ic.h"
-#include "volk_gnsssdr_16ic_resampler_16ic.h"
+//#include "volk_gnsssdr_16ic_resampler_16ic.h"
 #include "volk_gnsssdr_16ic_xn_resampler_16ic_xn.h"
 #include "volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h"
 
diff --git a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_resampler_16ic.h b/src/algorithms/tracking/libs/volk_gnsssdr_16ic_resampler_16ic.h
deleted file mode 100644
index 4580225..0000000
--- a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_resampler_16ic.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*!
- * \file volk_gnsssdr_16ic_resampler_16ic.h
- * \brief Volk protokernel: resample a 16 bits complex vector
- * \authors <ul>
- *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
- *          </ul>
- *
- * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part 
- * and 8 bits the imaginary part) and accumulates them
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
-#define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
-
-#include <volk_gnsssdr/volk_gnsssdr_common.h>
-#include <volk_gnsssdr/volk_gnsssdr_complex.h>
-#include <cmath>
-//#pragma STDC FENV_ACCESS ON
-
-#ifdef LV_HAVE_GENERIC
-
-//int round_int( float r ) {
-//    return (r > 0.0) ? (r + 0.5) : (r - 0.5);
-//}
-/*!
- \brief Multiplies the two input complex vectors, point-by-point, storing the result in the third vector
- \param cVector The vector where the result will be stored
- \param aVector One of the vectors to be multiplied
- \param bVector One of the vectors to be multiplied
- \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
- */
-static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)
-{
-    int local_code_chip_index;
-    //fesetround(FE_TONEAREST);
-    for (unsigned int n = 0; n < num_output_samples; n++)
-        {
-			// resample code for current tap
-			local_code_chip_index = round(code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f);
-	    	if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
-	    	if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips;
-			//std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" ";
-			result[n] = local_code[local_code_chip_index];
-        }
-    //std::cout<<std::endl;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, int code_length_chips)//, int* scratch_buffer, float* scratch_buffer_float)
-{
-
-	_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
-    unsigned int number;
-    const unsigned int quarterPoints = num_output_samples / 4;
-
-	lv_16sc_t* _result = result;
-
-    __attribute__((aligned(16))) int local_code_chip_index[4];
-    __m128 _rem_code_phase,_code_phase_step_chips;
-    __m128i _code_length_chips,_code_length_chips_minus1;
-    __m128 _code_phase_out,_code_phase_out_with_offset;
-    rem_code_phase_chips=rem_code_phase_chips-0.5f;
-
-    _rem_code_phase =  _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
-    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
-    __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
-    four_times_code_length_chips_minus1[0]=code_length_chips-1;
-    four_times_code_length_chips_minus1[1]=code_length_chips-1;
-    four_times_code_length_chips_minus1[2]=code_length_chips-1;
-    four_times_code_length_chips_minus1[3]=code_length_chips-1;
-
-    __attribute__((aligned(16))) int four_times_code_length_chips[4];
-    four_times_code_length_chips[0]=code_length_chips;
-    four_times_code_length_chips[1]=code_length_chips;
-    four_times_code_length_chips[2]=code_length_chips;
-    four_times_code_length_chips[3]=code_length_chips;
-
-    _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
-    _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
-
-    __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
-
-    __m128i zero=_mm_setzero_si128();
-
-
-    __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
-    __m128 _4output_index=_mm_load_ps(init_idx_float);
-    __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
-    __m128 _4constant_float=_mm_load_ps(init_4constant_float);
-
-    //__attribute__((aligned(16))) int output_indexes[4];
-
-    for(number=0;number < quarterPoints; number++){
-    	_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
-    	_code_phase_out_with_offset = _mm_add_ps(_code_phase_out,_rem_code_phase); //add the phase offset
-    	_code_phase_out_int=_mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
-
-    	negative_indexes=_mm_cmplt_epi32 (_code_phase_out_int, zero); //test for negative values
-    	_code_phase_out_int_neg=_mm_add_epi32(_code_phase_out_int,_code_length_chips); //the negative values branch
-    	//_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int));
-    	_code_phase_out_int_neg=_mm_xor_si128(_code_phase_out_int,_mm_and_si128( negative_indexes,_mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
-
-    	overflow_indexes=_mm_cmpgt_epi32  (_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
-    	_code_phase_out_int_over=_mm_sub_epi32(_code_phase_out_int_neg,_code_length_chips); //the negative values branch
-    	_code_phase_out_int_over=_mm_xor_si128(_code_phase_out_int_neg,_mm_and_si128( overflow_indexes,_mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
-
-    	_mm_storeu_si128((__m128i*)local_code_chip_index,_code_phase_out_int_over); // Store the results back
-
-    	//_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset);
-
-    	//todo: optimize the local code lookup table with intrinsics, if possible
-    	*_result++=local_code[local_code_chip_index[0]];
-    	*_result++=local_code[local_code_chip_index[1]];
-    	*_result++=local_code[local_code_chip_index[2]];
-    	*_result++=local_code[local_code_chip_index[3]];
-
-    	_4output_index = _mm_add_ps(_4output_index,_4constant_float);
-    	//_scratch_buffer_float+=4;
-
-    }
-
-    for(number = quarterPoints * 4;number < num_output_samples; number++){
-    	local_code_chip_index[0]=static_cast<int>(code_phase_step_chips*static_cast<float>(number) + rem_code_phase_chips+0.5f);
-    	if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips-1;
-    	if (local_code_chip_index[0]  > (code_length_chips-1)) local_code_chip_index[0] -= code_length_chips;
-    	*_result++=local_code[local_code_chip_index[0]];
-    	//*_scratch_buffer_float++=code_phase_step_chips*static_cast<float>(number)+rem_code_phase_chips;
-    }
-
-//    for(unsigned int n=0;n<num_output_samples;n++)
-//    {
-//
-//		std::cout<<"s["<<n<<"]="<<scratch_buffer_float[n]<<","<<scratch_buffer[n]<<" ";
-//    }
-//    std::cout<<std::endl;
-
-
-
-
-}
-#endif /* LV_HAVE_SSE2 */
-
-#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H*/
diff --git a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/tracking/libs/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
deleted file mode 100644
index 75164cd..0000000
--- a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*!
- * \file volk_gnsssdr_16ic_x2_dot_prod_16ic.h
- * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
- * \authors <ul>
- *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
- *          </ul>
- *
- * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part 
- * and 8 bits the imaginary part) and accumulates them
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
-#define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
-
-#include <volk_gnsssdr/volk_gnsssdr_common.h>
-#include <volk_gnsssdr/volk_gnsssdr_complex.h>
-#include <stdio.h>
-#include <string.h>
-#include "saturated_arithmetic.h"
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector
- \param cVector The vector where the accumulated result will be stored
- \param aVector One of the vectors to be multiplied and accumulated
- \param bVector One of the vectors to be multiplied and accumulated
- \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
- */
-static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
-{
-	result[0]=lv_16sc_t(0,0);
-	for (unsigned int n=0;n<num_points;n++)
-	{
-		//r*a.r - i*a.i, i*a.r + r*a.i
-		//result[0]+=in_a[n]*in_b[n];
-		lv_16sc_t tmp=in_a[n]*in_b[n];
-		result[0].real(sat_adds16b(result[0].real(),tmp.real()));
-		result[0].imag(sat_adds16b(result[0].imag(),tmp.imag()));
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
-{
-
-	lv_16sc_t dotProduct=lv_16sc_t(0,0);
-
-    const unsigned int sse_iters = num_points / 4;
-
-	const lv_16sc_t* _in_a = in_a;
-	const lv_16sc_t* _in_b = in_b;
-	lv_16sc_t* _out = out;
-
-    if (sse_iters>0)
-        {
-
-
-		__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result;
-
-        realcacc = _mm_setzero_si128();
-        imagcacc = _mm_setzero_si128();
-
-        mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-        mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
-
-		for(unsigned int number = 0;number < sse_iters; number++)
-			{
-	    	//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-	    	//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
-	    	// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-	            a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-	            b = _mm_loadu_si128((__m128i*)_in_b);
-	            c=_mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
-
-	            c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-	            real = _mm_subs_epi16 (c,c_sr);
-
-	            b_sl = _mm_slli_si128(b,2); // b3.r, b2.i ....
-	            a_sl = _mm_slli_si128(a,2); // a3.r, a2.i ....
-
-	            imag1 = _mm_mullo_epi16(a,b_sl); // a3.i*b3.r, ....
-	            imag2 = _mm_mullo_epi16(b,a_sl); // b3.i*a3.r, ....
-
-	            imag = _mm_adds_epi16(imag1,imag2); //with saturation aritmetic!
-
-                realcacc = _mm_adds_epi16 (realcacc, real);
-                imagcacc = _mm_adds_epi16 (imagcacc, imag);
-
-				_in_a += 4;
-				_in_b += 4;
-
-			}
-
-
-			realcacc = _mm_and_si128 (realcacc, mask_real);
-			imagcacc = _mm_and_si128 (imagcacc, mask_imag);
-
-			result = _mm_or_si128 (realcacc, imagcacc);
-
-			__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
-
-			_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
-
-			for (int i = 0; i<4; ++i)
-				{
-					dotProduct.real(sat_adds16b(dotProduct.real(),dotProductVector[i].real()));
-					dotProduct.imag(sat_adds16b(dotProduct.imag(),dotProductVector[i].imag()));
-				}
-        }
-
-    for (unsigned int i = 0; i<(num_points % 4); ++i)
-        {
-            //dotProduct += (*_in_a++) * (*_in_b++);
-    		lv_16sc_t tmp=(*_in_a++) * (*_in_b++);
-			dotProduct.real(sat_adds16b(dotProduct.real(),tmp.real()));
-			dotProduct.imag(sat_adds16b(dotProduct.imag(),tmp.imag()));
-
-        }
-
-    *_out = dotProduct;
-}
-#endif /* LV_HAVE_SSE2 */
-
-#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/
diff --git a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h b/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h
index f55a1b0..b133bac 100644
--- a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h
+++ b/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h
@@ -38,7 +38,7 @@
 
 
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
-#include "saturated_arithmetic.h"
+#include <volk_gnsssdr/saturated_arithmetic.h>
 
 #ifdef LV_HAVE_GENERIC
 /*!
@@ -50,18 +50,17 @@
  */
 static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors)
 {
-	for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
-	{
-		result[n_vec]=lv_cmake(0,0);
-		for (unsigned int n=0;n<num_points;n++)
-		{
-			//r*a.r - i*a.i, i*a.r + r*a.i
-			//result[n_vec]+=in_common[n]*in_a[n_vec][n];
-			lv_16sc_t tmp=in_common[n]*in_a[n_vec][n];
-			result[n_vec]=lv_cmake(sat_adds16b(lv_creal(result[n_vec]),lv_creal(tmp)),sat_adds16b(lv_cimag(result[n_vec]),lv_cimag(tmp)));
-		}
-
-	}
+    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+        {
+            result[n_vec] = lv_cmake(0,0);
+            for (unsigned int n = 0; n < num_points; n++)
+                {
+                    //r*a.r - i*a.i, i*a.r + r*a.i
+                    //result[n_vec]+=in_common[n]*in_a[n_vec][n];
+                    lv_16sc_t tmp = in_common[n]*in_a[n_vec][n];
+                    result[n_vec] = lv_cmake(sat_adds16b(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[n_vec]), lv_cimag(tmp)));
+                }
+        }
 }
 
 #endif /*LV_HAVE_GENERIC*/
@@ -71,99 +70,92 @@ static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* resu
 #include <emmintrin.h>
 static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors)
 {
-
-	lv_16sc_t dotProduct=lv_cmake(0,0);
+    lv_16sc_t dotProduct = lv_cmake(0,0);
 
     const unsigned int sse_iters = num_points / 4;
 
-	const lv_16sc_t** _in_a = in_a;
-	const lv_16sc_t* _in_common = in_common;
-	lv_16sc_t* _out = out;
+    const lv_16sc_t** _in_a = in_a;
+    const lv_16sc_t* _in_common = in_common;
+    lv_16sc_t* _out = out;
 
-    if (sse_iters>0)
+    if (sse_iters > 0)
         {
+            __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
 
-		__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
-
-		//todo dyn mem reg
-
-		__m128i* realcacc;
-		__m128i* imagcacc;
-
-    	realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
-    	imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
+            //todo dyn mem reg
 
+            __m128i* realcacc;
+            __m128i* imagcacc;
 
-		__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
+            realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
+            imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
 
-        mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-        mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+            __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
 
-		for(unsigned int number = 0;number < sse_iters; number++)
-			{
-	    	//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-	    	//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
-	    	// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+            mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+            mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
 
-            	b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-            	for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
-            	{
+            for(unsigned int number = 0; number < sse_iters; number++)
+                {
+                    //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
+                    //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
+                    // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
 
-					a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+                    b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+                    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                        {
+                            a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
 
-					c=_mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
+                            c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
 
-					c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-					real = _mm_subs_epi16 (c,c_sr);
+                            c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+                            real = _mm_subs_epi16 (c, c_sr);
 
-					b_sl = _mm_slli_si128(b,2); // b3.r, b2.i ....
-					a_sl = _mm_slli_si128(a,2); // a3.r, a2.i ....
+                            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+                            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
 
-					imag1 = _mm_mullo_epi16(a,b_sl); // a3.i*b3.r, ....
-					imag2 = _mm_mullo_epi16(b,a_sl); // b3.i*a3.r, ....
+                            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+                            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
 
-					imag = _mm_adds_epi16(imag1,imag2);
+                            imag = _mm_adds_epi16(imag1, imag2);
 
-					realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real);
-					imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
+                            realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real);
+                            imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
 
-            	}
-				_in_common += 4;
+                        }
+                    _in_common += 4;
+                }
 
-			}
+            for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
+                {
+                    realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real);
+                    imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag);
 
-			for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
-			{
-				realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real);
-				imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag);
+                    result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]);
 
-				result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]);
-
-				_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
-				dotProduct=lv_cmake(0,0);
-				for (int i = 0; i<4; ++i)
-					{
-
-				dotProduct=lv_cmake(sat_adds16b(lv_creal(dotProduct),lv_creal(dotProductVector[i])),
-						sat_adds16b(lv_cimag(dotProduct),lv_cimag(dotProductVector[i])));
-					}
-				_out[n_vec]=dotProduct;
-			}
-			free(realcacc);
-			free(imagcacc);
+                    _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
+                    dotProduct = lv_cmake(0,0);
+                    for (int i = 0; i<4; ++i)
+                        {
+                            dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
+                                    sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
+                        }
+                    _out[n_vec] = dotProduct;
+                }
+            free(realcacc);
+            free(imagcacc);
         }
 
-	for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
-	{
-		for(unsigned int n  = sse_iters * 4;n < num_points; n++){
-
-				lv_16sc_t tmp=in_common[n]*in_a[n_vec][n];
+    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+        {
+            for(unsigned int n  = sse_iters * 4;n < num_points; n++){
 
-				_out[n_vec]=lv_cmake(sat_adds16b(lv_creal(_out[n_vec]),lv_creal(tmp)),
-						sat_adds16b(lv_cimag(_out[n_vec]),lv_cimag(tmp)));
+                    lv_16sc_t tmp = in_common[n]*in_a[n_vec][n];
 
-			}
-	}
+                    _out[n_vec] = lv_cmake(sat_adds16b(lv_creal(_out[n_vec]), lv_creal(tmp)),
+                            sat_adds16b(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
+            }
+        }
 
 }
 #endif /* LV_HAVE_SSE2 */
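
The _xn variant keeps one accumulator per local replica against a common input vector, which is what the tracking multicorrelator needs (e.g. Early, Prompt and Late taps). A hypothetical sketch of the generic path follows (not part of this commit; sizes and values are illustrative):

    // xn_dot_prod_sketch.cc  (hypothetical example)
    #define LV_HAVE_GENERIC
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>   // lv_16sc_t, lv_cmake
    #include "volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h"
    #include <cstdint>
    #include <vector>

    int main()
    {
        const unsigned int num_points = 4000;
        const int num_correlators = 3;   // e.g. Early, Prompt and Late replicas

        std::vector<lv_16sc_t> baseband(num_points, lv_cmake((int16_t)1, (int16_t)0));
        std::vector<std::vector<lv_16sc_t> > replicas(num_correlators,
                std::vector<lv_16sc_t>(num_points, lv_cmake((int16_t)1, (int16_t)0)));

        const lv_16sc_t* replica_ptrs[3];
        for (int i = 0; i < num_correlators; i++)
            {
                replica_ptrs[i] = replicas[i].data();
            }

        lv_16sc_t results[3];
        volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(results, baseband.data(),
                replica_ptrs, num_points, num_correlators);
        // results[i] = saturated accumulation of baseband[n] * replicas[i][n] over n
        return 0;
    }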

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git