[hamradio-commits] [gnss-sdr] 130/236: adding _mm256_zeroupper() at the end of AVX and AVX2 protokernels
Carles Fernandez
carles_fernandez-guest at moszumanska.debian.org
Tue Apr 26 16:02:44 UTC 2016
This is an automated email from the git hooks/post-receive script.
carles_fernandez-guest pushed a commit to branch next
in repository gnss-sdr.
commit 78372ba2e9886ff5dc4f606d673db97d13ef47bf
Author: Carles Fernandez <carles.fernandez at gmail.com>
Date: Mon Mar 28 11:58:01 2016 +0200
adding _mm256_zeroupper() at the end of AVX and AVX2 protokernels
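This avoids the performance penalty incurred when execution transitions
from 256-bit AVX instructions to legacy (non-VEX) SSE instructions, by
clearing the upper halves of the YMM registers once the AVX code is done.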
---
.../kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h | 2 ++
.../volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h | 2 ++
.../kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h | 4 ++--
.../volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h | 4 ++--
.../kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h | 11 +++++++----
.../volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h | 2 ++
.../kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h | 4 ++--
7 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
index 1dc13b4..1d1c0be 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
@@ -290,6 +290,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
+ _mm256_zeroupper();
for (i = 0; i < 8; ++i)
{
@@ -365,6 +366,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
+ _mm256_zeroupper();
for (i = 0; i < 8; ++i)
{
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h
index 0480a85..dfdf13d 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h
@@ -369,6 +369,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
}
+ _mm256_zeroupper();
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
@@ -461,6 +462,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
}
+ _mm256_zeroupper();
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h
index 50df6d4..aead40d 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h
@@ -225,7 +225,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
_in_b += 8;
_out += 8;
}
-
+ _mm256_zeroupper();
number = avx2_points * 8;
for(;number < num_points; number++)
{
@@ -279,7 +279,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
_in_b += 8;
_out += 8;
}
-
+ _mm256_zeroupper();
number = avx2_points * 8;
for(;number < num_points; number++)
{
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h
index a4c683c..91b3da8 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h
@@ -930,7 +930,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
-
+ _mm256_zeroupper();
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
@@ -1241,7 +1241,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
}
_out[n_vec] = dotProduct;
}
-
+ _mm256_zeroupper();
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h
index 32532f4..5a007d4 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h
@@ -37,7 +37,8 @@
*
* \b Overview
*
- * Computes the sine and cosine of a vector of floats, providing the output in a complex vector (cosine, sine)
+ * VOLK_GNSSSDR kernel that computes the sine and cosine of a vector
+ * of floats, providing the output in a complex vector (cosine, sine)
*
* <b>Dispatcher Prototype</b>
* \code
@@ -133,12 +134,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
cplxValue = _mm_unpacklo_ps(cosine, sine);
-
_mm_storeu_ps((float*)bPtr, cplxValue);
bPtr += 2;
+
cplxValue = _mm_unpackhi_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, cplxValue);
bPtr += 2;
+
aPtr += 4;
}
@@ -226,12 +228,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
cplxValue = _mm_unpacklo_ps(cosine, sine);
-
_mm_store_ps((float*)bPtr, cplxValue);
bPtr += 2;
+
cplxValue = _mm_unpackhi_ps(cosine, sine);
_mm_store_ps((float*)bPtr, cplxValue);
bPtr += 2;
+
aPtr += 4;
}
@@ -587,7 +590,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
_in = *in++;
d = (int32_t)floor(_in / TWO_PI + 0.5);
_in -= d * TWO_PI;
- x = (int32_t) ((float) _in * TWO_TO_THE_31_DIV_PI);
+ x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
ux = x;
sin_index = ux >> diffbits;
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
index 0433e0b..ef00e5d 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
@@ -479,6 +479,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
+ _mm256_zeroupper();
for(unsigned int n = avx_iters * 4; n < num_points; n++)
{
@@ -602,6 +603,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
+ _mm256_zeroupper();
for(unsigned int n = avx_iters * 4; n < num_points; n++)
{
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h
index 276ee45..8fbadea 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h
@@ -588,7 +588,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
-
+ _mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++)
{
@@ -756,7 +756,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
-
+ _mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++)
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git
More information about the pkg-hamradio-commits
mailing list