[opencv] 170/251: imgproc: fix vectorized code of accumulate
Nobuhiro Iwamatsu
iwamatsu at moszumanska.debian.org
Sun Aug 27 23:27:39 UTC 2017
This is an automated email from the git hooks/post-receive script.
iwamatsu pushed a commit to annotated tag 3.3.0
in repository opencv.
commit caa5e3b4c5c82dac993ba2acacf0413458a7bffd
Author: Alexander Alekhin <alexander.alekhin at intel.com>
Date: Wed Jul 26 13:11:31 2017 +0300
imgproc: fix vectorized code of accumulate
---
modules/imgproc/src/accum.simd.hpp | 197 ++++++++++++++-----------------------
1 file changed, 73 insertions(+), 124 deletions(-)
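In short: each loop touched below used v_load to pull a full 16-byte register
of 8-bit mask (or source) values, then widened only its low half with
v_expand, discarding the high half into a dummy register. Since the loops
advance by just 8 (or 4) elements per iteration, the 16-byte load reads past
the data the iteration consumes and can overrun the buffer on the final
iteration. The fix swaps the load+expand pair for v_load_expand /
v_load_expand_q, which read exactly 8 or 4 bytes and widen in one step. A
minimal before/after sketch, assuming OpenCV's 128-bit universal intrinsics
(the function name is illustrative, not from the patch):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    void widen_mask_demo(const uchar* mask, int x)
    {
        // Before: a 16-byte load, even though only 8 bytes are used.
        v_uint8x16 _v_mask = v_load(mask + x);   // may read past the buffer
        v_uint16x8 v_mask_old, dummy;
        v_expand(_v_mask, v_mask_old, dummy);    // keep low half, drop high

        // After: read exactly 8 bytes, widen uchar -> ushort in one step.
        v_uint16x8 v_mask_new = v_load_expand(mask + x);

        // For 32-bit lanes: read exactly 4 bytes, widen uchar -> uint.
        v_uint32x4 v_mask32 = v_load_expand_q(mask + x);

        (void)v_mask_old; (void)v_mask_new; (void)v_mask32;
    }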
diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp
index e2be2c9..7a29447 100644
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp
@@ -425,9 +425,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
v_uint16x8 v_0 = v_setall_u16(0);
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 _v_mask = v_load(mask + x);
- v_uint16x8 v_mask, dummy;
- v_expand(_v_mask, v_mask, dummy);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
v_mask = ~(v_mask == v_0);
v_uint16x8 v_src = v_load(src + x);
v_src = v_src & v_mask;
@@ -443,9 +441,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
v_uint16x8 v_0 = v_setall_u16(0);
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 _v_mask = v_load(mask + x);
- v_uint16x8 v_mask, dummy;
- v_expand(_v_mask, v_mask, dummy);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
v_mask = ~(v_mask == v_0);
v_uint16x8 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -491,8 +487,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 v_masku16, dummy0;
- v_expand(v_load(mask + x), v_masku16, dummy0);
+ v_uint16x8 v_masku16 = v_load_expand(mask + x);
v_uint32x4 v_masku320, v_masku321;
v_expand(v_masku16, v_masku320, v_masku321);
v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
@@ -506,8 +501,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 v_masku16, dummy0;
- v_expand(v_load(mask + x), v_masku16, dummy0);
+ v_uint16x8 v_masku16 = v_load_expand(mask + x);
v_uint32x4 v_masku320, v_masku321;
v_expand(v_masku16, v_masku320, v_masku321);
v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
@@ -770,8 +764,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
{
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 v_mask, dummy;
- v_expand(v_load(mask + x), v_mask, dummy);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
v_mask = ~(v_mask == v_0);
v_uint16x8 v_src = v_load(src + x);
v_src = v_src & v_mask;
@@ -803,8 +796,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
{
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 v_mask, dummy;
- v_expand(v_load(mask + x), v_mask, dummy);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
v_mask = ~(v_mask == v_0);
v_uint16x8 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -871,10 +863,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 v_masku16, dummy0;
- v_expand(v_load(mask + x), v_masku16, dummy0);
- v_uint32x4 v_masku32, dummy1;
- v_expand(v_masku16, v_masku32, dummy1);
+ v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -892,10 +881,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 v_masku16, dummy0;
- v_expand(v_load(mask + x), v_masku16, dummy0);
- v_uint32x4 v_masku32, dummy1;
- v_expand(v_masku16, v_masku32, dummy1);
+ v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -947,10 +933,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 v_masku16, dummy0;
- v_expand(v_load(mask + x), v_masku16, dummy0);
- v_uint32x4 v_masku32, dummy1;
- v_expand(v_masku16, v_masku32, dummy1);
+ v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -967,10 +950,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 v_masku16, dummy0;
- v_expand(v_load(mask + x), v_masku16, dummy0);
- v_uint32x4 v_masku32, dummy1;
- v_expand(v_masku16, v_masku32, dummy1);
+ v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
v_expand(v_masku32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
@@ -1157,9 +1137,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
+ v_uint16x8 v_mask16 = v_load_expand(mask + x);
v_uint32x4 v_mask0, v_mask1;
- v_expand(stub, v_mask0, v_mask1);
+ v_expand(v_mask16, v_mask0, v_mask1);
v_mask0 = ~(v_mask0 == v_0);
v_mask1 = ~(v_mask1 == v_0);
v_uint16x8 v_src = v_load(src + x);
@@ -1182,9 +1162,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
{
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
+ v_uint16x8 v_mask16 = v_load_expand(mask + x);
v_uint32x4 v_mask0, v_mask1;
- v_expand(stub, v_mask0, v_mask1);
+ v_expand(v_mask16, v_mask0, v_mask1);
v_mask0 = ~(v_mask0 == v_0);
v_mask1 = ~(v_mask1 == v_0);
@@ -1254,11 +1234,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 v_stub = v_load_expand(mask + x);
- v_uint32x4 v_stub0, v_stub1;
- v_expand(v_stub, v_stub0, v_stub1);
- v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0));
- v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0));
+ v_uint16x8 v_mask16 = v_load_expand(mask + x);
+ v_uint32x4 v_mask_0, v_mask_1;
+ v_expand(v_mask16, v_mask_0, v_mask_1);
+ v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+ v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
v_float32x4 v_src0 = v_load(src + x);
v_float32x4 v_src1 = v_load(src + x + 4);
v_src0 = v_src0 & v_mask0;
@@ -1274,11 +1254,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 v_stub = v_load_expand(mask + x);
- v_uint32x4 v_stub0, v_stub1;
- v_expand(v_stub, v_stub0, v_stub1);
- v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0));
- v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0));
+ v_uint16x8 v_mask16 = v_load_expand(mask + x);
+ v_uint32x4 v_mask_0, v_mask_1;
+ v_expand(v_mask16, v_mask_0, v_mask_1);
+ v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+ v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
@@ -1319,9 +1299,7 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
int size = len * cn;
for (; x <= size - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 v_src = v_load(src + x);
- v_uint16x8 v_int, dummy;
- v_expand(v_src, v_int, dummy);
+ v_uint16x8 v_int = v_load_expand(src + x);
v_uint32x4 v_int0, v_int1;
v_expand(v_int, v_int0, v_int1);
@@ -1353,17 +1331,15 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
}
else
{
- v_uint8x16 v_0 = v_setzero_u8();
+ v_uint16x8 v_0 = v_setzero_u16();
if (cn == 1)
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 v_mask = v_load(mask + x);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
v_mask = ~(v_mask == v_0);
- v_uint8x16 v_src = v_load(src + x);
- v_src = v_src & v_mask;
- v_uint16x8 v_int, dummy;
- v_expand(v_src, v_int, dummy);
+ v_uint16x8 v_src = v_load_expand(src + x);
+ v_uint16x8 v_int = v_src & v_mask;
v_uint32x4 v_int0, v_int1;
v_expand(v_int, v_int0, v_int1);
@@ -1395,19 +1371,19 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
}
else if (cn == 3)
{
- for (; x <= len - cVectorWidth; x += cVectorWidth)
+ for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth)
{
- v_uint8x16 v_mask = v_load(mask + x);
- v_mask = ~(v_mask == v_0);
v_uint8x16 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
- v_src0 = v_src0 & v_mask;
- v_src1 = v_src1 & v_mask;
- v_src2 = v_src2 & v_mask;
v_uint16x8 v_int0, v_int1, v_int2, dummy;
v_expand(v_src0, v_int0, dummy);
v_expand(v_src1, v_int1, dummy);
v_expand(v_src2, v_int2, dummy);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
+ v_mask = ~(v_mask == v_0);
+ v_int0 = v_int0 & v_mask;
+ v_int1 = v_int1 & v_mask;
+ v_int2 = v_int2 & v_mask;
v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
v_expand(v_int0, v_int00, v_int01);
@@ -1627,9 +1603,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 v_mask, dummy;
- v_expand(stub, v_mask, dummy);
+ v_uint32x4 v_mask = v_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_float32x4 v_src = v_load(src + x);
v_src = v_src & v_reinterpret_as_f32(v_mask);
@@ -1646,9 +1620,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 v_mask, dummy;
- v_expand(stub, v_mask, dummy);
+ v_uint32x4 v_mask = v_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_float32x4 v_src0, v_src1, v_src2;
@@ -1709,11 +1681,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 stub0, stub1;
- v_expand(stub, stub0, stub1);
+ v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
- v_expand(stub0, v_masku640, v_masku641);
+ v_expand(v_mask32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
v_float64x2 v_src0 = v_load(src + x);
@@ -1731,11 +1701,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 stub0, stub1;
- v_expand(stub, stub0, stub1);
+ v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
- v_expand(stub0, v_masku640, v_masku641);
+ v_expand(v_mask32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
@@ -2059,11 +2027,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 stub0, stub1;
- v_expand(stub, stub0, stub1);
- v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0));
- v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0));
+ v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
+ v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
+ v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+ v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0));
v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1));
@@ -2073,11 +2040,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 stub0, stub1;
- v_expand(stub, stub0, stub1);
- v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0));
- v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0));
+ v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
+ v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
+ v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+ v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@@ -2109,12 +2075,8 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
int size = len * cn;
for (; x <= size - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 v_1src = v_load(src1 + x);
- v_uint8x16 v_2src = v_load(src2 + x);
-
- v_uint16x8 v_1int, v_2int, dummy;
- v_expand(v_1src, v_1int, dummy);
- v_expand(v_2src, v_2int, dummy);
+ v_uint16x8 v_1int = v_load_expand(src1 + x);
+ v_uint16x8 v_2int = v_load_expand(src2 + x);
v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2148,19 +2110,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
}
else
{
- v_uint8x16 v_0 = v_setzero_u8();
+ v_uint16x8 v_0 = v_setzero_u16();
if (cn == 1)
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 v_mask = v_load(mask + x);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
v_mask = ~(v_mask == v_0);
- v_uint8x16 v_1src = v_load(src1 + x) & v_mask;
- v_uint8x16 v_2src = v_load(src2 + x) & v_mask;
-
- v_uint16x8 v_1int, v_2int, dummy;
- v_expand(v_1src, v_1int, dummy);
- v_expand(v_2src, v_2int, dummy);
+ v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask;
+ v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask;
v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2194,19 +2152,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
}
else if (cn == 3)
{
- for (; x <= len - cVectorWidth; x += cVectorWidth)
+ for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth)
{
- v_uint8x16 v_mask = v_load(mask + x);
- v_mask = ~(v_mask == v_0);
v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
- v_1src0 = v_1src0 & v_mask;
- v_1src1 = v_1src1 & v_mask;
- v_1src2 = v_1src2 & v_mask;
- v_2src0 = v_2src0 & v_mask;
- v_2src1 = v_2src1 & v_mask;
- v_2src2 = v_2src2 & v_mask;
v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy;
v_expand(v_1src0, v_1int0, dummy);
@@ -2216,6 +2166,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
v_expand(v_2src1, v_2int1, dummy);
v_expand(v_2src2, v_2int2, dummy);
+ v_uint16x8 v_mask = v_load_expand(mask + x);
+ v_mask = ~(v_mask == v_0);
+ v_1int0 = v_1int0 & v_mask;
+ v_1int1 = v_1int1 & v_mask;
+ v_1int2 = v_1int2 & v_mask;
+ v_2int0 = v_2int0 & v_mask;
+ v_2int1 = v_2int1 & v_mask;
+ v_2int2 = v_2int2 & v_mask;
+
v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
v_expand(v_1int0, v_1int00, v_1int01);
@@ -2440,9 +2399,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 v_mask, dummy;
- v_expand(stub, v_mask, dummy);
+ v_uint32x4 v_mask = v_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_float32x4 v_1src = v_load(src1 + x);
v_float32x4 v_2src = v_load(src2 + x);
@@ -2462,9 +2419,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 v_mask, dummy;
- v_expand(stub, v_mask, dummy);
+ v_uint32x4 v_mask = v_load_expand_q(mask + x);
v_mask = ~(v_mask == v_0);
v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
@@ -2522,11 +2477,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 stub0, stub1;
- v_expand(stub, stub0, stub1);
+ v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
- v_expand(stub0, v_masku640, v_masku641);
+ v_expand(v_mask32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
@@ -2543,11 +2496,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
{
for (; x <= len - cVectorWidth; x += cVectorWidth)
{
- v_uint16x8 stub = v_load_expand(mask + x);
- v_uint32x4 stub0, stub1;
- v_expand(stub, stub0, stub1);
+ v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
v_uint64x2 v_masku640, v_masku641;
- v_expand(stub0, v_masku640, v_masku641);
+ v_expand(v_mask32, v_masku640, v_masku641);
v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
@@ -2704,12 +2655,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c
int size = len * cn;
for (; x <= size - cVectorWidth; x += cVectorWidth)
{
- v_uint8x16 v_src = v_load(src + x);
- v_uint16x8 v_int, dummy;
- v_expand(v_src, v_int, dummy);
+ v_uint16x8 v_src16 = v_load_expand(src + x);
v_uint32x4 v_int_0, v_int_1;
- v_expand(v_int, v_int_0, v_int_1);
+ v_expand(v_src16, v_int_0, v_int_1);
v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
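The same corrected pattern, end to end: a sketch of a masked accumulate loop
in the style of acc_simd_ above. This is an illustration written against
OpenCV 3.3's 128-bit universal intrinsics, not the library's actual
implementation; the function name and the scalar tail are added here for
completeness:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // dst[i] += src[i] wherever mask[i] != 0, 8 ushort elements per step.
    static void acc_masked_u16_to_f32(const ushort* src, float* dst,
                                      const uchar* mask, int len)
    {
        int x = 0;
    #if CV_SIMD128
        const v_uint16x8 v_0 = v_setzero_u16();
        for (; x <= len - 8; x += 8)
        {
            // Load exactly 8 mask bytes and widen; turn nonzero lanes
            // into all-ones so they can act as a bitwise select.
            v_uint16x8 v_mask = v_load_expand(mask + x);
            v_mask = ~(v_mask == v_0);

            v_uint16x8 v_src = v_load(src + x) & v_mask;

            // Widen to 32 bit, convert to float, accumulate.
            v_uint32x4 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);
            v_store(dst + x,
                    v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
            v_store(dst + x + 4,
                    v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
        }
    #endif
        for (; x < len; x++)        // scalar tail for the remaining elements
            if (mask[x])
                dst[x] += src[x];
    }

Note that the cn == 3 hunks also reorder the work: the mask is now widened
first and applied to the already-expanded 16-bit channels, and the loop bound
drops to len - 16 because v_load_deinterleave still reads 16 pixels per
channel even though each iteration consumes only 8.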
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/opencv.git