[opencv] 17/251: AVX optimized implementation of haar migrated to separate file
Nobuhiro Iwamatsu
iwamatsu at moszumanska.debian.org
Sun Aug 27 23:27:19 UTC 2017
This is an automated email from the git hooks/post-receive script.
iwamatsu pushed a commit to annotated tag 3.3.0
in repository opencv.
commit 77264dcca9d9377b65fdc66ffa91f8bbf72b36b4
Author: Vitaly Tuzov <terfendail at mediana.jetos.com>
Date: Thu Jun 29 21:35:52 2017 +0300
AVX optimized implementation of haar migrated to separate file
---
modules/objdetect/src/haar.avx.cpp | 369 ++++++++++++++++++++++++++++++++++
modules/objdetect/src/haar.cpp | 394 +++----------------------------------
modules/objdetect/src/haar.hpp | 101 ++++++++++
3 files changed, 492 insertions(+), 372 deletions(-)
diff --git a/modules/objdetect/src/haar.avx.cpp b/modules/objdetect/src/haar.avx.cpp
new file mode 100644
index 0000000..23dddfa
--- /dev/null
+++ b/modules/objdetect/src/haar.avx.cpp
@@ -0,0 +1,369 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// Intel License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* Haar features calculation */
+
+#include "precomp.hpp"
+#include "haar.hpp"
+
+namespace cv_haar_avx
+{
+
+// AVX version of icvEvalHidHaarClassifier. Processes 8 CvHidHaarClassifiers per call. Check AVX support before invocation!
+#if CV_HAAR_USE_AVX
+double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier, // Evaluates classifier[0..7] in parallel, one per AVX element, and returns the sum of the alpha values they select.
+ double variance_norm_factor, size_t p_offset) // variance_norm_factor scales every node threshold; p_offset is forwarded to calc_sumf for every rect.
+{
+ int CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 }; // current node index per lane, added to classifier[i].node; 32-byte aligned because it is written with _mm256_store_si256 below
+ uchar flags[8] = { 0,0,0,0,0,0,0,0 }; // flags[i] becomes 1 once the alpha of lane i has been added to res
+ CvHidHaarTreeNode* nodes[8];
+ double res = 0;
+ uchar exitConditionFlag = 0; // number of lanes that have finished
+ for (;;)
+ {
+ float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 }; // third-rect term per lane; stays 0 when rect[2].p0 is null
+ nodes[0] = (classifier + 0)->node + idxV[0];
+ nodes[1] = (classifier + 1)->node + idxV[1];
+ nodes[2] = (classifier + 2)->node + idxV[2];
+ nodes[3] = (classifier + 3)->node + idxV[3];
+ nodes[4] = (classifier + 4)->node + idxV[4];
+ nodes[5] = (classifier + 5)->node + idxV[5];
+ nodes[6] = (classifier + 6)->node + idxV[6];
+ nodes[7] = (classifier + 7)->node + idxV[7];
+
+ __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
+
+ t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, // t = threshold * variance_norm_factor per lane; _mm256_set_ps lists elements from highest to lowest, so nodes[0] lands in element 0 (same order as tmp and idxV)
+ nodes[6]->threshold,
+ nodes[5]->threshold,
+ nodes[4]->threshold,
+ nodes[3]->threshold,
+ nodes[2]->threshold,
+ nodes[1]->threshold,
+ nodes[0]->threshold));
+
+ __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), // rect[0] sums; calc_sumf is defined outside this file (presumably in haar.hpp - confirm)
+ calc_sumf(nodes[6]->feature.rect[0], p_offset),
+ calc_sumf(nodes[5]->feature.rect[0], p_offset),
+ calc_sumf(nodes[4]->feature.rect[0], p_offset),
+ calc_sumf(nodes[3]->feature.rect[0], p_offset),
+ calc_sumf(nodes[2]->feature.rect[0], p_offset),
+ calc_sumf(nodes[1]->feature.rect[0], p_offset),
+ calc_sumf(nodes[0]->feature.rect[0], p_offset));
+
+ __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
+ nodes[6]->feature.rect[0].weight,
+ nodes[5]->feature.rect[0].weight,
+ nodes[4]->feature.rect[0].weight,
+ nodes[3]->feature.rect[0].weight,
+ nodes[2]->feature.rect[0].weight,
+ nodes[1]->feature.rect[0].weight,
+ nodes[0]->feature.rect[0].weight);
+
+ __m256 sum = _mm256_mul_ps(offset, weight); // sum = rect[0] sum * rect[0] weight
+
+ offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), // same for rect[1]
+ calc_sumf(nodes[6]->feature.rect[1], p_offset),
+ calc_sumf(nodes[5]->feature.rect[1], p_offset),
+ calc_sumf(nodes[4]->feature.rect[1], p_offset),
+ calc_sumf(nodes[3]->feature.rect[1], p_offset),
+ calc_sumf(nodes[2]->feature.rect[1], p_offset),
+ calc_sumf(nodes[1]->feature.rect[1], p_offset),
+ calc_sumf(nodes[0]->feature.rect[1], p_offset));
+
+ weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
+ nodes[6]->feature.rect[1].weight,
+ nodes[5]->feature.rect[1].weight,
+ nodes[4]->feature.rect[1].weight,
+ nodes[3]->feature.rect[1].weight,
+ nodes[2]->feature.rect[1].weight,
+ nodes[1]->feature.rect[1].weight,
+ nodes[0]->feature.rect[1].weight);
+
+ sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); // sum += rect[1] sum * rect[1] weight
+
+ if (nodes[0]->feature.rect[2].p0) // rect[2] is optional: it contributes only when its p0 pointer is non-null (handled in scalar code per lane)
+ tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
+ if (nodes[1]->feature.rect[2].p0)
+ tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
+ if (nodes[2]->feature.rect[2].p0)
+ tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
+ if (nodes[3]->feature.rect[2].p0)
+ tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
+ if (nodes[4]->feature.rect[2].p0)
+ tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
+ if (nodes[5]->feature.rect[2].p0)
+ tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
+ if (nodes[6]->feature.rect[2].p0)
+ tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
+ if (nodes[7]->feature.rect[2].p0)
+ tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
+
+ sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); // add the third-rect terms (aligned load from tmp)
+
+ __m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left), // the int child indices are carried as floats so they can be selected with _mm256_blendv_ps
+ static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
+ static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
+ static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
+ __m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right), static_cast<float>(nodes[6]->right),
+ static_cast<float>(nodes[5]->right), static_cast<float>(nodes[4]->right),
+ static_cast<float>(nodes[3]->right), static_cast<float>(nodes[2]->right),
+ static_cast<float>(nodes[1]->right), static_cast<float>(nodes[0]->right));
+
+ _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ)))); // per lane: next index = left if sum < t, otherwise right; converted back to int with truncation and stored into idxV
+
+ for (int i = 0; i < 8; i++)
+ {
+ if (idxV[i] <= 0) // a non-positive index ends the traversal of lane i; its negation selects the alpha entry
+ {
+ if (!flags[i]) // count each lane and add its alpha only once
+ {
+ exitConditionFlag++;
+ flags[i] = 1;
+ res += (classifier + i)->alpha[-idxV[i]];
+ }
+ idxV[i] = 0; // a finished lane goes back to node 0 for the remaining iterations; flags[i] keeps its alpha from being added again
+ }
+ }
+ if (exitConditionFlag == 8) // all 8 lanes have finished
+ return res;
+ }
+}
+
+double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier, // Evaluates the first node of classifier[0..7] in parallel (up to three rects each) and returns the sum of the 8 selected alpha values.
+ double variance_norm_factor, size_t p_offset) // variance_norm_factor scales every node threshold; p_offset is forwarded to calc_sumf for every rect.
+{
+ float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 }; // third-rect term per lane (0 when rect[2].p0 is null); reused at the end as the output buffer of the horizontal sum
+ CvHidHaarTreeNode* nodes[8];
+
+ nodes[0] = classifier[0].node; // only the first node of each classifier is read; there is no tree traversal in this function
+ nodes[1] = classifier[1].node;
+ nodes[2] = classifier[2].node;
+ nodes[3] = classifier[3].node;
+ nodes[4] = classifier[4].node;
+ nodes[5] = classifier[5].node;
+ nodes[6] = classifier[6].node;
+ nodes[7] = classifier[7].node;
+
+ __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
+
+ t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, // t = threshold * variance_norm_factor per lane; _mm256_set_ps lists elements from highest to lowest, so nodes[0] lands in element 0 (same order as tmp)
+ nodes[6]->threshold,
+ nodes[5]->threshold,
+ nodes[4]->threshold,
+ nodes[3]->threshold,
+ nodes[2]->threshold,
+ nodes[1]->threshold,
+ nodes[0]->threshold));
+
+ __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), // rect[0] sums; calc_sumf is defined outside this file (presumably in haar.hpp - confirm)
+ calc_sumf(nodes[6]->feature.rect[0], p_offset),
+ calc_sumf(nodes[5]->feature.rect[0], p_offset),
+ calc_sumf(nodes[4]->feature.rect[0], p_offset),
+ calc_sumf(nodes[3]->feature.rect[0], p_offset),
+ calc_sumf(nodes[2]->feature.rect[0], p_offset),
+ calc_sumf(nodes[1]->feature.rect[0], p_offset),
+ calc_sumf(nodes[0]->feature.rect[0], p_offset));
+
+ __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
+ nodes[6]->feature.rect[0].weight,
+ nodes[5]->feature.rect[0].weight,
+ nodes[4]->feature.rect[0].weight,
+ nodes[3]->feature.rect[0].weight,
+ nodes[2]->feature.rect[0].weight,
+ nodes[1]->feature.rect[0].weight,
+ nodes[0]->feature.rect[0].weight);
+
+ __m256 sum = _mm256_mul_ps(offset, weight); // sum = rect[0] sum * rect[0] weight
+
+ offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), // same for rect[1]
+ calc_sumf(nodes[6]->feature.rect[1], p_offset),
+ calc_sumf(nodes[5]->feature.rect[1], p_offset),
+ calc_sumf(nodes[4]->feature.rect[1], p_offset),
+ calc_sumf(nodes[3]->feature.rect[1], p_offset),
+ calc_sumf(nodes[2]->feature.rect[1], p_offset),
+ calc_sumf(nodes[1]->feature.rect[1], p_offset),
+ calc_sumf(nodes[0]->feature.rect[1], p_offset));
+
+ weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
+ nodes[6]->feature.rect[1].weight,
+ nodes[5]->feature.rect[1].weight,
+ nodes[4]->feature.rect[1].weight,
+ nodes[3]->feature.rect[1].weight,
+ nodes[2]->feature.rect[1].weight,
+ nodes[1]->feature.rect[1].weight,
+ nodes[0]->feature.rect[1].weight);
+
+ sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); // sum += rect[1] sum * rect[1] weight
+
+ if (nodes[0]->feature.rect[2].p0) // rect[2] is optional: it contributes only when its p0 pointer is non-null (handled in scalar code per lane)
+ tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
+ if (nodes[1]->feature.rect[2].p0)
+ tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
+ if (nodes[2]->feature.rect[2].p0)
+ tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
+ if (nodes[3]->feature.rect[2].p0)
+ tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
+ if (nodes[4]->feature.rect[2].p0)
+ tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
+ if (nodes[5]->feature.rect[2].p0)
+ tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
+ if (nodes[6]->feature.rect[2].p0)
+ tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
+ if (nodes[7]->feature.rect[2].p0)
+ tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
+
+ sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); // add the third-rect terms (aligned load from tmp)
+
+ __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0], // alpha[0] of each classifier: the value used when sum < t
+ classifier[6].alpha[0],
+ classifier[5].alpha[0],
+ classifier[4].alpha[0],
+ classifier[3].alpha[0],
+ classifier[2].alpha[0],
+ classifier[1].alpha[0],
+ classifier[0].alpha[0]);
+ __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1], // alpha[1] of each classifier: the value used when t <= sum
+ classifier[6].alpha[1],
+ classifier[5].alpha[1],
+ classifier[4].alpha[1],
+ classifier[3].alpha[1],
+ classifier[2].alpha[1],
+ classifier[1].alpha[1],
+ classifier[0].alpha[1]);
+
+ __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)); // per lane: alpha1 where t <= sum, otherwise alpha0
+ outBuf = _mm256_hadd_ps(outBuf, outBuf); // _mm256_hadd_ps adds adjacent pairs within each 128-bit half
+ outBuf = _mm256_hadd_ps(outBuf, outBuf); // after the second pass every element of a half holds the sum of the 4 values of that half
+ _mm256_store_ps(tmp, outBuf);
+ return (tmp[0] + tmp[4]); // tmp[0] = sum of lanes 0..3, tmp[4] = sum of lanes 4..7
+}
+
+double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier, // Evaluates the first node of classifier[0..7] in parallel using only rect[0] and rect[1], and returns the sum of the 8 selected alpha values.
+ double variance_norm_factor, size_t p_offset) // variance_norm_factor scales every node threshold; p_offset is forwarded to calc_sumf for every rect.
+{
+ float CV_DECL_ALIGNED(32) buf[8]; // receives the 8 selected alpha values; 32-byte aligned because it is written with _mm256_store_ps
+ CvHidHaarTreeNode* nodes[8];
+ nodes[0] = classifier[0].node; // only the first node of each classifier is read; there is no tree traversal in this function
+ nodes[1] = classifier[1].node;
+ nodes[2] = classifier[2].node;
+ nodes[3] = classifier[3].node;
+ nodes[4] = classifier[4].node;
+ nodes[5] = classifier[5].node;
+ nodes[6] = classifier[6].node;
+ nodes[7] = classifier[7].node;
+
+ __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
+ t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, // t = threshold * variance_norm_factor per lane; _mm256_set_ps lists elements from highest to lowest, so nodes[0] lands in element 0 (same order as buf)
+ nodes[6]->threshold,
+ nodes[5]->threshold,
+ nodes[4]->threshold,
+ nodes[3]->threshold,
+ nodes[2]->threshold,
+ nodes[1]->threshold,
+ nodes[0]->threshold));
+
+ __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), // rect[0] sums; calc_sumf is defined outside this file (presumably in haar.hpp - confirm)
+ calc_sumf(nodes[6]->feature.rect[0], p_offset),
+ calc_sumf(nodes[5]->feature.rect[0], p_offset),
+ calc_sumf(nodes[4]->feature.rect[0], p_offset),
+ calc_sumf(nodes[3]->feature.rect[0], p_offset),
+ calc_sumf(nodes[2]->feature.rect[0], p_offset),
+ calc_sumf(nodes[1]->feature.rect[0], p_offset),
+ calc_sumf(nodes[0]->feature.rect[0], p_offset));
+
+ __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
+ nodes[6]->feature.rect[0].weight,
+ nodes[5]->feature.rect[0].weight,
+ nodes[4]->feature.rect[0].weight,
+ nodes[3]->feature.rect[0].weight,
+ nodes[2]->feature.rect[0].weight,
+ nodes[1]->feature.rect[0].weight,
+ nodes[0]->feature.rect[0].weight);
+
+ __m256 sum = _mm256_mul_ps(offset, weight); // sum = rect[0] sum * rect[0] weight
+
+ offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), // same for rect[1]
+ calc_sumf(nodes[6]->feature.rect[1], p_offset),
+ calc_sumf(nodes[5]->feature.rect[1], p_offset),
+ calc_sumf(nodes[4]->feature.rect[1], p_offset),
+ calc_sumf(nodes[3]->feature.rect[1], p_offset),
+ calc_sumf(nodes[2]->feature.rect[1], p_offset),
+ calc_sumf(nodes[1]->feature.rect[1], p_offset),
+ calc_sumf(nodes[0]->feature.rect[1], p_offset));
+
+ weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
+ nodes[6]->feature.rect[1].weight,
+ nodes[5]->feature.rect[1].weight,
+ nodes[4]->feature.rect[1].weight,
+ nodes[3]->feature.rect[1].weight,
+ nodes[2]->feature.rect[1].weight,
+ nodes[1]->feature.rect[1].weight,
+ nodes[0]->feature.rect[1].weight);
+
+ sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); // sum += rect[1] sum * rect[1] weight; rect[2] is never read in this variant
+
+ __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0], // alpha[0] of each classifier: the value used when sum < t
+ classifier[6].alpha[0],
+ classifier[5].alpha[0],
+ classifier[4].alpha[0],
+ classifier[3].alpha[0],
+ classifier[2].alpha[0],
+ classifier[1].alpha[0],
+ classifier[0].alpha[0]);
+ __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1], // alpha[1] of each classifier: the value used when t <= sum
+ classifier[6].alpha[1],
+ classifier[5].alpha[1],
+ classifier[4].alpha[1],
+ classifier[3].alpha[1],
+ classifier[2].alpha[1],
+ classifier[1].alpha[1],
+ classifier[0].alpha[1]);
+
+ _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ))); // per lane: alpha1 where t <= sum, otherwise alpha0
+ return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]); // scalar sum of the 8 lanes
+}
+
+#endif //CV_HAAR_USE_AVX
+
+}
+
+/* End of file. */
diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp
index c66d4de..5455e88 100644
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@@ -45,6 +45,10 @@
#include "opencv2/imgproc/imgproc_c.h"
#include "opencv2/objdetect/objdetect_c.h"
#include <stdio.h>
+#include "haar.hpp"
+#if CV_HAAR_FEATURE_MAX_LOCAL != CV_HAAR_FEATURE_MAX
+ #error CV_HAAR_FEATURE_MAX definition changed. Adjust CV_HAAR_FEATURE_MAX_LOCAL value please.
+#endif
#if CV_SSE2
# if 1 /*!CV_SSE4_1 && !CV_SSE4_2*/
@@ -53,8 +57,7 @@
# endif
#endif
-#if 0 /*CV_AVX*/
-# define CV_HAAR_USE_AVX 1
+#if CV_HAAR_USE_AVX
# if defined _MSC_VER
# pragma warning( disable : 4752 )
# endif
@@ -68,38 +71,6 @@
#define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0
-typedef int sumtype;
-typedef double sqsumtype;
-
-typedef struct CvHidHaarFeature
-{
- struct
- {
- sumtype *p0, *p1, *p2, *p3;
- float weight;
- }
- rect[CV_HAAR_FEATURE_MAX];
-} CvHidHaarFeature;
-
-
-typedef struct CvHidHaarTreeNode
-{
- CvHidHaarFeature feature;
- float threshold;
- int left;
- int right;
-} CvHidHaarTreeNode;
-
-
-typedef struct CvHidHaarClassifier
-{
- int count;
- //CvHaarFeature* orig_feature;
- CvHidHaarTreeNode* node;
- float* alpha;
-} CvHidHaarClassifier;
-
-
typedef struct CvHidHaarStageClassifier
{
int count;
@@ -420,10 +391,6 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade )
#define calc_sum(rect,offset) \
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
-#define calc_sumf(rect,offset) \
- static_cast<float>((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
-
-
CV_IMPL void
cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
const CvArr* _sum,
@@ -640,129 +607,6 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
}
-// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
-#ifdef CV_HAAR_USE_AVX
-CV_INLINE
-double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
- double variance_norm_factor, size_t p_offset )
-{
- int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
- uchar flags[8] = {0,0,0,0,0,0,0,0};
- CvHidHaarTreeNode* nodes[8];
- double res = 0;
- uchar exitConditionFlag = 0;
- for(;;)
- {
- float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
- nodes[0] = (classifier+0)->node + idxV[0];
- nodes[1] = (classifier+1)->node + idxV[1];
- nodes[2] = (classifier+2)->node + idxV[2];
- nodes[3] = (classifier+3)->node + idxV[3];
- nodes[4] = (classifier+4)->node + idxV[4];
- nodes[5] = (classifier+5)->node + idxV[5];
- nodes[6] = (classifier+6)->node + idxV[6];
- nodes[7] = (classifier+7)->node + idxV[7];
-
- __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
-
- t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
- nodes[6]->threshold,
- nodes[5]->threshold,
- nodes[4]->threshold,
- nodes[3]->threshold,
- nodes[2]->threshold,
- nodes[1]->threshold,
- nodes[0]->threshold));
-
- __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
- calc_sumf(nodes[6]->feature.rect[0], p_offset),
- calc_sumf(nodes[5]->feature.rect[0], p_offset),
- calc_sumf(nodes[4]->feature.rect[0], p_offset),
- calc_sumf(nodes[3]->feature.rect[0], p_offset),
- calc_sumf(nodes[2]->feature.rect[0], p_offset),
- calc_sumf(nodes[1]->feature.rect[0], p_offset),
- calc_sumf(nodes[0]->feature.rect[0], p_offset));
-
- __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
- nodes[6]->feature.rect[0].weight,
- nodes[5]->feature.rect[0].weight,
- nodes[4]->feature.rect[0].weight,
- nodes[3]->feature.rect[0].weight,
- nodes[2]->feature.rect[0].weight,
- nodes[1]->feature.rect[0].weight,
- nodes[0]->feature.rect[0].weight);
-
- __m256 sum = _mm256_mul_ps(offset, weight);
-
- offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
- calc_sumf(nodes[6]->feature.rect[1], p_offset),
- calc_sumf(nodes[5]->feature.rect[1], p_offset),
- calc_sumf(nodes[4]->feature.rect[1], p_offset),
- calc_sumf(nodes[3]->feature.rect[1], p_offset),
- calc_sumf(nodes[2]->feature.rect[1], p_offset),
- calc_sumf(nodes[1]->feature.rect[1], p_offset),
- calc_sumf(nodes[0]->feature.rect[1], p_offset));
-
- weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
- nodes[6]->feature.rect[1].weight,
- nodes[5]->feature.rect[1].weight,
- nodes[4]->feature.rect[1].weight,
- nodes[3]->feature.rect[1].weight,
- nodes[2]->feature.rect[1].weight,
- nodes[1]->feature.rect[1].weight,
- nodes[0]->feature.rect[1].weight);
-
- sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
-
- if( nodes[0]->feature.rect[2].p0 )
- tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
- if( nodes[1]->feature.rect[2].p0 )
- tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
- if( nodes[2]->feature.rect[2].p0 )
- tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
- if( nodes[3]->feature.rect[2].p0 )
- tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
- if( nodes[4]->feature.rect[2].p0 )
- tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
- if( nodes[5]->feature.rect[2].p0 )
- tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
- if( nodes[6]->feature.rect[2].p0 )
- tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
- if( nodes[7]->feature.rect[2].p0 )
- tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
-
- sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
-
- __m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),
- static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
- static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
- static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
- __m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right),static_cast<float>(nodes[6]->right),
- static_cast<float>(nodes[5]->right),static_cast<float>(nodes[4]->right),
- static_cast<float>(nodes[3]->right),static_cast<float>(nodes[2]->right),
- static_cast<float>(nodes[1]->right),static_cast<float>(nodes[0]->right));
-
- _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));
-
- for(int i = 0; i < 8; i++)
- {
- if(idxV[i]<=0)
- {
- if(!flags[i])
- {
- exitConditionFlag++;
- flags[i] = 1;
- res += (classifier+i)->alpha[-idxV[i]];
- }
- idxV[i]=0;
- }
- }
- if(exitConditionFlag == 8)
- return res;
- }
-}
-#endif //CV_HAAR_USE_AVX
-
CV_INLINE
double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
double variance_norm_factor,
@@ -823,8 +667,8 @@ static int
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
CvPoint pt, double& stage_sum, int start_stage )
{
-#ifdef CV_HAAR_USE_AVX
- bool haveAVX = cv::checkHardwareSupport(CV_CPU_AVX);
+#if CV_HAAR_USE_AVX
+ bool haveAVX = CV_CPU_HAS_SUPPORT_AVX;
#else
# ifdef CV_HAAR_USE_SSE
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
@@ -870,14 +714,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
stage_sum = 0.0;
j = 0;
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
if(haveAVX)
{
for( ; j <= ptr->count - 8; j += 8 )
{
- stage_sum += icvEvalHidHaarClassifierAVX(
- ptr->classifier + j,
- variance_norm_factor, p_offset );
+ stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX(
+ ptr->classifier + j,
+ variance_norm_factor, p_offset );
}
}
#endif
@@ -901,106 +745,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
}
else if( cascade->isStumpBased )
{
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
if(haveAVX)
{
- CvHidHaarClassifier* classifiers[8];
- CvHidHaarTreeNode* nodes[8];
for( i = start_stage; i < cascade->count; i++ )
{
stage_sum = 0.0;
j = 0;
- float CV_DECL_ALIGNED(32) buf[8];
if( cascade->stage_classifier[i].two_rects )
{
for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 )
{
- classifiers[0] = cascade->stage_classifier[i].classifier + j;
- nodes[0] = classifiers[0]->node;
- classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
- nodes[1] = classifiers[1]->node;
- classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
- nodes[2] = classifiers[2]->node;
- classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
- nodes[3] = classifiers[3]->node;
- classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
- nodes[4] = classifiers[4]->node;
- classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
- nodes[5] = classifiers[5]->node;
- classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
- nodes[6] = classifiers[6]->node;
- classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
- nodes[7] = classifiers[7]->node;
-
- __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
- t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
- nodes[6]->threshold,
- nodes[5]->threshold,
- nodes[4]->threshold,
- nodes[3]->threshold,
- nodes[2]->threshold,
- nodes[1]->threshold,
- nodes[0]->threshold));
-
- __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
- calc_sumf(nodes[6]->feature.rect[0], p_offset),
- calc_sumf(nodes[5]->feature.rect[0], p_offset),
- calc_sumf(nodes[4]->feature.rect[0], p_offset),
- calc_sumf(nodes[3]->feature.rect[0], p_offset),
- calc_sumf(nodes[2]->feature.rect[0], p_offset),
- calc_sumf(nodes[1]->feature.rect[0], p_offset),
- calc_sumf(nodes[0]->feature.rect[0], p_offset));
-
- __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
- nodes[6]->feature.rect[0].weight,
- nodes[5]->feature.rect[0].weight,
- nodes[4]->feature.rect[0].weight,
- nodes[3]->feature.rect[0].weight,
- nodes[2]->feature.rect[0].weight,
- nodes[1]->feature.rect[0].weight,
- nodes[0]->feature.rect[0].weight);
-
- __m256 sum = _mm256_mul_ps(offset, weight);
-
- offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
- calc_sumf(nodes[6]->feature.rect[1], p_offset),
- calc_sumf(nodes[5]->feature.rect[1], p_offset),
- calc_sumf(nodes[4]->feature.rect[1], p_offset),
- calc_sumf(nodes[3]->feature.rect[1], p_offset),
- calc_sumf(nodes[2]->feature.rect[1], p_offset),
- calc_sumf(nodes[1]->feature.rect[1], p_offset),
- calc_sumf(nodes[0]->feature.rect[1], p_offset));
-
- weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
- nodes[6]->feature.rect[1].weight,
- nodes[5]->feature.rect[1].weight,
- nodes[4]->feature.rect[1].weight,
- nodes[3]->feature.rect[1].weight,
- nodes[2]->feature.rect[1].weight,
- nodes[1]->feature.rect[1].weight,
- nodes[0]->feature.rect[1].weight);
-
- sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
-
- __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
- classifiers[6]->alpha[0],
- classifiers[5]->alpha[0],
- classifiers[4]->alpha[0],
- classifiers[3]->alpha[0],
- classifiers[2]->alpha[0],
- classifiers[1]->alpha[0],
- classifiers[0]->alpha[0]);
- __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
- classifiers[6]->alpha[1],
- classifiers[5]->alpha[1],
- classifiers[4]->alpha[1],
- classifiers[3]->alpha[1],
- classifiers[2]->alpha[1],
- classifiers[1]->alpha[1],
- classifiers[0]->alpha[1]);
-
- _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
- stage_sum += (buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+ stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierTwoRectAVX(
+ cascade->stage_classifier[i].classifier + j,
+ variance_norm_factor, p_offset);
}
for( ; j < cascade->stage_classifier[i].count; j++ )
@@ -1018,117 +776,9 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
{
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
{
- float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
-
- classifiers[0] = cascade->stage_classifier[i].classifier + j;
- nodes[0] = classifiers[0]->node;
- classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
- nodes[1] = classifiers[1]->node;
- classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
- nodes[2] = classifiers[2]->node;
- classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
- nodes[3] = classifiers[3]->node;
- classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
- nodes[4] = classifiers[4]->node;
- classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
- nodes[5] = classifiers[5]->node;
- classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
- nodes[6] = classifiers[6]->node;
- classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
- nodes[7] = classifiers[7]->node;
-
- __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
-
- t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
- nodes[6]->threshold,
- nodes[5]->threshold,
- nodes[4]->threshold,
- nodes[3]->threshold,
- nodes[2]->threshold,
- nodes[1]->threshold,
- nodes[0]->threshold));
-
- __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
- calc_sumf(nodes[6]->feature.rect[0], p_offset),
- calc_sumf(nodes[5]->feature.rect[0], p_offset),
- calc_sumf(nodes[4]->feature.rect[0], p_offset),
- calc_sumf(nodes[3]->feature.rect[0], p_offset),
- calc_sumf(nodes[2]->feature.rect[0], p_offset),
- calc_sumf(nodes[1]->feature.rect[0], p_offset),
- calc_sumf(nodes[0]->feature.rect[0], p_offset));
-
- __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
- nodes[6]->feature.rect[0].weight,
- nodes[5]->feature.rect[0].weight,
- nodes[4]->feature.rect[0].weight,
- nodes[3]->feature.rect[0].weight,
- nodes[2]->feature.rect[0].weight,
- nodes[1]->feature.rect[0].weight,
- nodes[0]->feature.rect[0].weight);
-
- __m256 sum = _mm256_mul_ps(offset, weight);
-
- offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
- calc_sumf(nodes[6]->feature.rect[1], p_offset),
- calc_sumf(nodes[5]->feature.rect[1], p_offset),
- calc_sumf(nodes[4]->feature.rect[1], p_offset),
- calc_sumf(nodes[3]->feature.rect[1], p_offset),
- calc_sumf(nodes[2]->feature.rect[1], p_offset),
- calc_sumf(nodes[1]->feature.rect[1], p_offset),
- calc_sumf(nodes[0]->feature.rect[1], p_offset));
-
- weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
- nodes[6]->feature.rect[1].weight,
- nodes[5]->feature.rect[1].weight,
- nodes[4]->feature.rect[1].weight,
- nodes[3]->feature.rect[1].weight,
- nodes[2]->feature.rect[1].weight,
- nodes[1]->feature.rect[1].weight,
- nodes[0]->feature.rect[1].weight);
-
- sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
-
- if( nodes[0]->feature.rect[2].p0 )
- tmp[0] = calc_sumf(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
- if( nodes[1]->feature.rect[2].p0 )
- tmp[1] = calc_sumf(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
- if( nodes[2]->feature.rect[2].p0 )
- tmp[2] = calc_sumf(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
- if( nodes[3]->feature.rect[2].p0 )
- tmp[3] = calc_sumf(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
- if( nodes[4]->feature.rect[2].p0 )
- tmp[4] = calc_sumf(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
- if( nodes[5]->feature.rect[2].p0 )
- tmp[5] = calc_sumf(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
- if( nodes[6]->feature.rect[2].p0 )
- tmp[6] = calc_sumf(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
- if( nodes[7]->feature.rect[2].p0 )
- tmp[7] = calc_sumf(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
-
- sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
-
- __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
- classifiers[6]->alpha[0],
- classifiers[5]->alpha[0],
- classifiers[4]->alpha[0],
- classifiers[3]->alpha[0],
- classifiers[2]->alpha[0],
- classifiers[1]->alpha[0],
- classifiers[0]->alpha[0]);
- __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
- classifiers[6]->alpha[1],
- classifiers[5]->alpha[1],
- classifiers[4]->alpha[1],
- classifiers[3]->alpha[1],
- classifiers[2]->alpha[1],
- classifiers[1]->alpha[1],
- classifiers[0]->alpha[1]);
-
- __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
- outBuf = _mm256_hadd_ps(outBuf, outBuf);
- outBuf = _mm256_hadd_ps(outBuf, outBuf);
- _mm256_store_ps(buf, outBuf);
- stage_sum += (buf[0] + buf[4]);
+ stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierAVX(
+ cascade->stage_classifier[i].classifier + j,
+ variance_norm_factor, p_offset);
}
for( ; j < cascade->stage_classifier[i].count; j++ )
@@ -1241,14 +891,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
stage_sum = 0.0;
int k = 0;
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
if(haveAVX)
{
for( ; k < cascade->stage_classifier[i].count - 8; k += 8 )
{
- stage_sum += icvEvalHidHaarClassifierAVX(
- cascade->stage_classifier[i].classifier + k,
- variance_norm_factor, p_offset );
+ stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX(
+ cascade->stage_classifier[i].classifier + k,
+ variance_norm_factor, p_offset );
}
}
#endif
diff --git a/modules/objdetect/src/haar.hpp b/modules/objdetect/src/haar.hpp
new file mode 100644
index 0000000..72a0af4
--- /dev/null
+++ b/modules/objdetect/src/haar.hpp
@@ -0,0 +1,101 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// Intel License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* Haar features calculation */
+
+#ifndef OPENCV_OBJDETECT_HAAR_HPP
+#define OPENCV_OBJDETECT_HAAR_HPP
+
+#define CV_HAAR_FEATURE_MAX_LOCAL 3
+
+typedef int sumtype;
+typedef double sqsumtype;
+
+typedef struct CvHidHaarFeature // one Haar feature: a fixed-size array of weighted rects
+{
+ struct
+ {
+ sumtype *p0, *p1, *p2, *p3; // four pointers into sumtype data; calc_sumf reads all four at one offset as p0 - p1 - p2 + p3
+ float weight; // factor the evaluation code multiplies with this rect's calc_sumf result
+ }
+ rect[CV_HAAR_FEATURE_MAX_LOCAL]; // CV_HAAR_FEATURE_MAX_LOCAL (3) slots; the AVX code in this patch tests rect[2].p0 for non-null before using the third one
+} CvHidHaarFeature;
+
+
+typedef struct CvHidHaarTreeNode // one classifier node: a feature, a float threshold and two int fields
+{
+ CvHidHaarFeature feature; // weighted rects belonging to this node
+ float threshold; // NOTE(review): presumably the value the weighted rect sum is compared against - confirm in haar.cpp
+ int left; // NOTE(review): looks like the link followed on one comparison outcome - encoding not visible here, confirm
+ int right; // NOTE(review): counterpart of `left` for the other outcome - confirm
+} CvHidHaarTreeNode;
+
+
+typedef struct CvHidHaarClassifier // one hidden classifier: a node array plus its alpha output values
+{
+ int count; // NOTE(review): presumably the number of entries in `node` - confirm against the code that fills it
+ //CvHaarFeature* orig_feature;
+ CvHidHaarTreeNode* node; // pointer to this classifier's node(s); ownership is not visible in this header
+ float* alpha; // output values; the AVX stump code in this patch reads alpha[0] and alpha[1] and picks alpha[1] where t <= sum, else alpha[0]
+} CvHidHaarClassifier;
+
+#define calc_sumf(rect,offset) \
+ static_cast<float>((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) // p0 - p1 - p2 + p3 at `offset`, cast to float; both arguments are expanded four times, so pass side-effect-free expressions
+
+namespace cv_haar_avx // groups the AVX entry points; note the CV_HAAR_USE_AVX macro below is NOT scoped by this namespace
+{
+#if 0 /*CV_TRY_AVX*/
+ #define CV_HAAR_USE_AVX 1 // unreachable while the condition above is the literal 0
+#else
+ #define CV_HAAR_USE_AVX 0 // always taken in this revision, so the AVX declarations below are compiled out
+#endif
+
+#if CV_HAAR_USE_AVX
+ // AVX variants of icvEvalHidHaarClassifier. Each call processes 8 consecutive CvHidHaarClassifiers starting at `classifier`. The caller must check for AVX support before invoking them.
+ double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset); // called from cvRunHaarClassifierCascadeSum under if(haveAVX), stepping k by 8
+ double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset); // stump variant; its result is added to stage_sum by the caller
+ double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset); // NOTE(review): presumably the stump variant for features using only two rects - confirm in haar.avx.cpp
+#endif
+}
+
+#endif
+
+/* End of file. */
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/opencv.git
More information about the debian-science-commits
mailing list