[hamradio-commits] [gnss-sdr] 135/149: Fixing missing phase increment in SIMD implementations

Sat Feb 6 19:43:13 UTC 2016

This is an automated email from the git hooks/post-receive script.

carles_fernandez-guest pushed a commit to branch next
in repository gnss-sdr.

commit db321d1c2ed48754c39bebf2c253d0cd9a4c4177
Merge: 8c07815 a262552
Author: Carles Fernandez <carles.fernandez at gmail.com>
Date:   Fri Jan 29 19:42:30 2016 +0100

    Fixing missing phase increment in SIMD implementations
    
    After computing the rotation with SIMD instructions, we were not
    incrementing the phase step, so the first iteration in the 'c region'
    had the same phase than the last sample computed with SIMD instructions.
    This commit fix the bux in SSE3 and NEON implementations

 .../volk_gnsssdr_16ic_rotatorpuppet_16ic.h         |  16 +-
 .../volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h      | 282 ++++++++++-----------
 2 files changed, 140 insertions(+), 158 deletions(-)

diff --cc src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h
index e3421ba,aa4fd20..c86b460

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h
@@@ -72,198 -73,181 +73,179 @@@ static inline void volk_gnsssdr_16ic_s3
  #endif /* LV_HAVE_GENERIC */
  
  
- #ifdef LV_HAVE_SSE2
- #include <emmintrin.h>
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
  
- static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
-     __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
+     __m128 a,b, two_phase_acc_reg,two_phase_inc_reg;
+     __m128i c1,c2,result;
+     __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
 -    two_phase_inc[0]=phase_inc*phase_inc;
 -    two_phase_inc[1]=phase_inc*phase_inc;
 -    two_phase_inc_reg=_mm_load_ps((float*) two_phase_inc);
++    two_phase_inc[0] = phase_inc*phase_inc;
++    two_phase_inc[1] = phase_inc*phase_inc;
++    two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
+     __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
 -    two_phase_acc[0]=(*phase);
 -    two_phase_acc[1]=(*phase)*phase_inc;
 -    two_phase_acc_reg=_mm_load_ps((float*) two_phase_acc);
++    two_phase_acc[0] = (*phase);
++    two_phase_acc[1] = (*phase) * phase_inc;
++    two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
  
-     mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-     mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
- 
-     const lv_16sc_t* _in_a = inVector;
-     __attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
-     // debug
-     //__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
- 
-     // specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
-     __attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
-     __m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
+     const lv_16sc_t* _in = inVector;
  
-     //const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = outVector;
  
-     __m128 fc_reg1, fc_reg2;
-     __m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
 -	__m128 yl, yh, tmp1, tmp2, tmp3;
 -
 -	for(unsigned int number = 0; number < sse_iters; number++)
 -	{
 -		a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 -		//complex 32fc multiplication b=a*two_phase_acc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -		c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
 -
 -		//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -
 -		//next two samples
 -		_in += 2;
 -		a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 -		//complex 32fc multiplication b=a*two_phase_acc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -		c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
 -
 -		//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -
 -		// store four output samples
 -		result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
 -		_mm_store_si128((__m128i*)_out, result);
 -
 -		//next two samples
 -		_in += 2;
 -		_out += 4;
 -	}
++    __m128 yl, yh, tmp1, tmp2, tmp3;
 +
 +    for(unsigned int number = 0; number < sse_iters; number++)
 +        {
-             //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-             //imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
-             // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-             a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-             //b = _mm_loadu_si128((__m128i*)_in_b);
- 
-             // compute next four 16ic complex exponential values for phase rotation
- 
-             // compute next four float complex rotations
-             four_phase_rotations_32fc[0]=*phase;
-             (*phase) *= phase_inc;
-             four_phase_rotations_32fc[1]=*phase;
-             (*phase) *= phase_inc;
-             four_phase_rotations_32fc[2]=*phase;
-             (*phase) *= phase_inc;
-             four_phase_rotations_32fc[3]=*phase;
-             (*phase) *= phase_inc;
-             //convert the rotations to integers
-             fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
- 
-             // disable next line for 1 bit rotation (equivalent to a square wave NCO)
-             fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
- 
-             fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
-             sc_reg1 = _mm_cvtps_epi32(fc_reg1);
-             sc_reg2 = _mm_cvtps_epi32(fc_reg2);
-             b = _mm_packs_epi32(sc_reg1, sc_reg2);
- 
-             // debug
-             //_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
-             //printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
- 
-             // multiply the input vector times the rotations
-             c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
- 
-             c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-             real = _mm_subs_epi16 (c, c_sr);
-             real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
- 
-             b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-             a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
- 
-             imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-             imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
- 
-             imag = _mm_adds_epi16(imag1, imag2);
-             imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
- 
-             result = _mm_or_si128 (real, imag);
- 
-             // normalize the rotations
-             // TODO
- 
-             // store results
++            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++            //complex 32fc multiplication b=a*two_phase_acc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++            c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
++
++            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            //next two samples
++            _in += 2;
++            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++            //complex 32fc multiplication b=a*two_phase_acc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++            c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
++
++            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            // store four output samples
++            result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
 +            _mm_store_si128((__m128i*)_out, result);
 +
-             _in_a += 4;
++            //next two samples
++            _in += 2;
 +            _out += 4;
 +        }
  
+     _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
 -    (*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]);
++    (*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]) * phase_inc;
+     lv_16sc_t tmp16;
+     lv_32fc_t tmp32;
      for (unsigned int i = sse_iters * 4; i < num_points; ++i)
          {
-             *_out++ = *_in_a++ * (*phase);
 -			tmp16 = *_in++;
 -			tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 -			*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 -			(*phase) *= phase_inc;
 -
++            tmp16 = *_in++;
++            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
++            *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 +            (*phase) *= phase_inc;
          }
  }
- #endif /* LV_HAVE_SSE2 */
  
+ #endif /* LV_HAVE_SSE3 */
  
- #ifdef LV_HAVE_SSE2
- #include <emmintrin.h>
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
  
- static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
  {
      const unsigned int sse_iters = num_points / 4;
-     __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
- 
-     mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-     mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+     __m128 a,b, two_phase_acc_reg,two_phase_inc_reg;
+     __m128i c1,c2,result;
+     __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
 -    two_phase_inc[0]=phase_inc*phase_inc;
 -    two_phase_inc[1]=phase_inc*phase_inc;
 -    two_phase_inc_reg=_mm_load_ps((float*) two_phase_inc);
++    two_phase_inc[0] = phase_inc*phase_inc;
++    two_phase_inc[1] = phase_inc*phase_inc;
++    two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
+     __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
 -    two_phase_acc[0]=(*phase);
 -    two_phase_acc[1]=(*phase)*phase_inc;
 -    two_phase_acc_reg=_mm_load_ps((float*) two_phase_acc);
++    two_phase_acc[0] = (*phase);
++    two_phase_acc[1] = (*phase)*phase_inc;
++    two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
  
-     const lv_16sc_t* _in_a = inVector;
-     __attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
-     // debug
-     //__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
- 
-     // specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
-     __attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
-     __m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
+     const lv_16sc_t* _in = inVector;
  
-     //const lv_16sc_t* _in_b = in_b;
      lv_16sc_t* _out = outVector;
  
-     __m128 fc_reg1, fc_reg2;
-     __m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
 -	__m128 yl, yh, tmp1, tmp2, tmp3;
 -
 -	for(unsigned int number = 0; number < sse_iters; number++)
 -	{
 -		a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 -		//complex 32fc multiplication b=a*two_phase_acc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -		c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
 -
 -		//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -
 -		//next two samples
 -		_in += 2;
 -		a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
 -		//complex 32fc multiplication b=a*two_phase_acc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -		c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
 -
 -		//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
 -		yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
 -		yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
 -		tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
 -		tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
 -		tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
 -		two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 -
 -		// store four output samples
 -		result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
 -		_mm_storeu_si128((__m128i*)_out, result);
 -
 -		//next two samples
 -		_in += 2;
 -		_out += 4;
 -	}
++    __m128 yl, yh, tmp1, tmp2, tmp3;
 +
 +    for(unsigned int number = 0; number < sse_iters; number++)
 +        {
-             //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-             //imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
-             // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-             a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-             //b = _mm_loadu_si128((__m128i*)_in_b);
- 
-             // compute next four 16ic complex exponential values for phase rotation
- 
-             // compute next four float complex rotations
-             four_phase_rotations_32fc[0]=*phase;
-             (*phase) *= phase_inc;
-             four_phase_rotations_32fc[1]=*phase;
-             (*phase) *= phase_inc;
-             four_phase_rotations_32fc[2]=*phase;
-             (*phase) *= phase_inc;
-             four_phase_rotations_32fc[3]=*phase;
-             (*phase) *= phase_inc;
-             //convert the rotations to integers
-             fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
- 
-             // disable next line for 1 bit rotation (equivalent to a square wave NCO)
-             fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
- 
-             fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
-             sc_reg1 = _mm_cvtps_epi32(fc_reg1);
-             sc_reg2 = _mm_cvtps_epi32(fc_reg2);
-             b = _mm_packs_epi32(sc_reg1, sc_reg2);
- 
-             // debug
-             //_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
-             //printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
- 
-             // multiply the input vector times the rotations
-             c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
- 
-             c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-             real = _mm_subs_epi16 (c, c_sr);
-             real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0,  a3.r*b3.r- a3.i*b3.i
- 
-             b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-             a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
- 
-             imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-             imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
- 
-             imag = _mm_adds_epi16(imag1, imag2);
-             imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
- 
-             result = _mm_or_si128 (real, imag);
- 
-             // normalize the rotations
-             // TODO
- 
-             // store results
++            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++            //complex 32fc multiplication b=a*two_phase_acc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++            c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
++
++            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            //next two samples
++            _in += 2;
++            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++            //complex 32fc multiplication b=a*two_phase_acc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++            c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
++
++            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
++            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
++            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
++            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
++            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++            two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            // store four output samples
++            result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
 +            _mm_storeu_si128((__m128i*)_out, result);
 +
-             _in_a += 4;
++            //next two samples
++            _in += 2;
 +            _out += 4;
 +        }
  
+     _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
 -    (*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]);
++    (*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]) * phase_inc;
+     lv_16sc_t tmp16;
+     lv_32fc_t tmp32;
      for (unsigned int i = sse_iters * 4; i < num_points; ++i)
          {
-             *_out++ = *_in_a++ * (*phase);
 -			tmp16 = *_in++;
 -			tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
 -			*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 -			(*phase) *= phase_inc;
 -
++            tmp16 = *_in++;
++            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
++            *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
 +            (*phase) *= phase_inc;
          }
  }
- #endif /* LV_HAVE_SSE2 */
+ 
+ #endif /* LV_HAVE_SSE3 */
  
  #ifdef LV_HAVE_NEON
  #include <arm_neon.h>

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git