[arrayfire] 88/284: moved bilateral, convolve, fftconvolve to cpu kernel namespace
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Feb 7 18:59:22 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.
commit d03bb75f24481953484443cbed46df62264a5008
Author: pradeep <pradeep at arrayfire.com>
Date: Fri Dec 18 17:09:11 2015 -0500
moved bilateral, convolve, fftconvolve to cpu kernel namespace
---
src/backend/cpu/bilateral.cpp | 77 +------
src/backend/cpu/convolve.cpp | 247 +--------------------
src/backend/cpu/fftconvolve.cpp | 236 ++------------------
src/backend/cpu/kernel/approx1.hpp | 1 +
.../cpu/{bilateral.cpp => kernel/bilateral.hpp} | 50 +----
.../cpu/{convolve.cpp => kernel/convolve.hpp} | 164 ++++----------
src/backend/cpu/kernel/fftconvolve.hpp | 227 +++++++++++++++++++
7 files changed, 308 insertions(+), 694 deletions(-)
diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp
index ea38ea7..c751f99 100644
--- a/src/backend/cpu/bilateral.cpp
+++ b/src/backend/cpu/bilateral.cpp
@@ -11,6 +11,7 @@
#include <af/defines.h>
#include <ArrayInfo.hpp>
#include <Array.hpp>
+#include <kernel/bilateral.hpp>
#include <bilateral.hpp>
#include <cmath>
#include <algorithm>
@@ -22,87 +23,13 @@ using af::dim4;
namespace cpu
{
-static inline dim_t clamp(int a, dim_t mn, dim_t mx)
-{
- return (a < (int)mn ? mn : (a > (int)mx ? mx : a));
-}
-
-static inline unsigned getIdx(const dim4 &strides,
- int i, int j = 0, int k = 0, int l = 0)
-{
- return (l * strides[3] +
- k * strides[2] +
- j * strides[1] +
- i * strides[0]);
-}
-
-template<typename inType, typename outType, bool isColor>
-void bilateral_(Array<outType> out, const Array<inType> in, float s_sigma, float c_sigma)
-{
- const dim4 dims = in.dims();
- const dim4 istrides = in.strides();
-
- const dim4 ostrides = out.strides();
-
- outType *outData = out.get();
- const inType *inData = in.get();
-
- // clamp spatical and chromatic sigma's
- float space_ = std::min(11.5f, std::max(s_sigma, 0.f));
- float color_ = std::max(c_sigma, 0.f);
- const dim_t radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1);
- const float svar = space_*space_;
- const float cvar = color_*color_;
-
- for(dim_t b3=0; b3<dims[3]; ++b3) {
- // b3 for loop handles following batch configurations
- // - gfor
- // - input based batch
- // - when input is 4d array for color images
- for(dim_t b2=0; b2<dims[2]; ++b2) {
- // b2 for loop handles following batch configurations
- // - channels
- // - input based batch
- // - when input is 3d array for grayscale images
- for(dim_t j=0; j<dims[1]; ++j) {
- // j steps along 2nd dimension
- for(dim_t i=0; i<dims[0]; ++i) {
- // i steps along 1st dimension
- outType norm = 0.0;
- outType res = 0.0;
- const outType center = (outType)inData[getIdx(istrides, i, j)];
- for(dim_t wj=-radius; wj<=radius; ++wj) {
- // clamps offsets
- dim_t tj = clamp(j+wj, 0, dims[1]-1);
- for(dim_t wi=-radius; wi<=radius; ++wi) {
- // clamps offsets
- dim_t ti = clamp(i+wi, 0, dims[0]-1);
- // proceed
- const outType val= (outType)inData[getIdx(istrides, ti, tj)];
- const outType gauss_space = (wi*wi+wj*wj)/(-2.0*svar);
- const outType gauss_range = ((center-val)*(center-val))/(-2.0*cvar);
- const outType weight = std::exp(gauss_space+gauss_range);
- norm += weight;
- res += val*weight;
- }
- } // filter loop ends here
-
- outData[getIdx(ostrides, i, j)] = res/norm;
- } //1st dimension loop ends here
- } //2nd dimension loop ends here
- outData += ostrides[2];
- inData += istrides[2];
- }
- }
-}
-
template<typename inType, typename outType, bool isColor>
Array<outType> bilateral(const Array<inType> &in, const float &s_sigma, const float &c_sigma)
{
in.eval();
const dim4 dims = in.dims();
Array<outType> out = createEmptyArray<outType>(dims);
- getQueue().enqueue(bilateral_<inType, outType, isColor>, out, in, s_sigma, c_sigma);
+ getQueue().enqueue(kernel::bilateral<inType, outType, isColor>, out, in, s_sigma, c_sigma);
return out;
}
diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp
index 239b4f0..218ba8e 100644
--- a/src/backend/cpu/convolve.cpp
+++ b/src/backend/cpu/convolve.cpp
@@ -16,170 +16,13 @@
#include <math.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/convolve.hpp>
using af::dim4;
namespace cpu
{
-template<typename T, typename accT, bool expand>
-void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
- dim4 const &sDims, dim4 const &fDims, dim4 const &sStrides)
-{
- dim_t start = (expand ? 0 : fDims[0]/2);
- dim_t end = (expand ? oDims[0] : start + sDims[0]);
- for(dim_t i=start; i<end; ++i) {
- accT accum = 0.0;
- for(dim_t f=0; f<fDims[0]; ++f) {
- dim_t iIdx = i-f;
- T s_val = ((iIdx>=0 &&iIdx<sDims[0])? iptr[iIdx*sStrides[0]] : T(0));
- accum += accT(s_val * fptr[f]);
- }
- optr[i-start] = T(accum);
- }
-}
-
-template<typename T, typename accT, bool expand>
-void one2one_2d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
- dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides,
- dim4 const &sStrides, dim4 const &fStrides)
-{
- dim_t jStart = (expand ? 0 : fDims[1]/2);
- dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]);
- dim_t iStart = (expand ? 0 : fDims[0]/2);
- dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]);
-
- for(dim_t j=jStart; j<jEnd; ++j) {
- dim_t joff = (j-jStart)*oStrides[1];
-
- for(dim_t i=iStart; i<iEnd; ++i) {
-
- accT accum = accT(0);
- for(dim_t wj=0; wj<fDims[1]; ++wj) {
- dim_t jIdx = j-wj;
- dim_t w_joff = wj*fStrides[1];
- dim_t s_joff = jIdx * sStrides[1];
- bool isJValid = (jIdx>=0 && jIdx<sDims[1]);
-
- for(dim_t wi=0; wi<fDims[0]; ++wi) {
- dim_t iIdx = i-wi;
-
- T s_val = T(0);
- if ( isJValid && (iIdx>=0 && iIdx<sDims[0])) {
- s_val = iptr[s_joff+iIdx*sStrides[0]];
- }
-
- accum += accT(s_val * fptr[w_joff+wi*fStrides[0]]);
- }
- }
- optr[joff+i-iStart] = T(accum);
- }
- }
-}
-
-template<typename T, typename accT, bool expand>
-void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
- dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides,
- dim4 const &sStrides, dim4 const &fStrides)
-{
- dim_t kStart = (expand ? 0 : fDims[2]/2);
- dim_t kEnd = (expand ? oDims[2] : kStart + sDims[2]);
- dim_t jStart = (expand ? 0 : fDims[1]/2);
- dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]);
- dim_t iStart = (expand ? 0 : fDims[0]/2);
- dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]);
-
- for(dim_t k=kStart; k<kEnd; ++k) {
- dim_t koff = (k-kStart)*oStrides[2];
-
- for(dim_t j=jStart; j<jEnd; ++j) {
- dim_t joff = (j-jStart)*oStrides[1];
-
- for(dim_t i=iStart; i<iEnd; ++i) {
-
- accT accum = accT(0);
- for(dim_t wk=0; wk<fDims[2]; ++wk) {
- dim_t kIdx = k-wk;
- dim_t w_koff = wk*fStrides[2];
- dim_t s_koff = kIdx * sStrides[2];
- bool isKValid = (kIdx>=0 && kIdx<sDims[2]);
-
- for(dim_t wj=0; wj<fDims[1]; ++wj) {
- dim_t jIdx = j-wj;
- dim_t w_joff = wj*fStrides[1];
- dim_t s_joff = jIdx * sStrides[1];
- bool isJValid = (jIdx>=0 && jIdx<sDims[1]);
-
- for(dim_t wi=0; wi<fDims[0]; ++wi) {
- dim_t iIdx = i-wi;
-
- T s_val = T(0);
- if ( isKValid && isJValid && (iIdx>=0 && iIdx<sDims[0])) {
- s_val = iptr[s_koff+s_joff+iIdx*sStrides[0]];
- }
-
- accum += accT(s_val * fptr[w_koff+w_joff+wi*fStrides[0]]);
- }
- }
- }
- optr[koff+joff+i-iStart] = T(accum);
- } //i loop ends here
- } // j loop ends here
- } // k loop ends here
-}
-
-template<typename T, typename accT, dim_t baseDim, bool expand>
-void convolve_nd(T *optr, T const *iptr, accT const *fptr,
- dim4 const &oDims, dim4 const &sDims, dim4 const &fDims,
- dim4 const &oStrides, dim4 const &sStrides, dim4 const &fStrides,
- ConvolveBatchKind kind)
-{
- dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
- dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
- dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
- dim_t batch[4] = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */
-
- for (dim_t i=1; i<4; ++i) {
- switch(kind) {
- case CONVOLVE_BATCH_SIGNAL:
- out_step[i] = oStrides[i];
- in_step[i] = sStrides[i];
- if (i>=baseDim) batch[i] = sDims[i];
- break;
- case CONVOLVE_BATCH_SAME:
- out_step[i] = oStrides[i];
- in_step[i] = sStrides[i];
- filt_step[i] = fStrides[i];
- if (i>=baseDim) batch[i] = sDims[i];
- break;
- case CONVOLVE_BATCH_KERNEL:
- out_step[i] = oStrides[i];
- filt_step[i] = fStrides[i];
- if (i>=baseDim) batch[i] = fDims[i];
- break;
- default:
- break;
- }
- }
-
- for (dim_t b3=0; b3<batch[3]; ++b3) {
- for (dim_t b2=0; b2<batch[2]; ++b2) {
- for (dim_t b1=0; b1<batch[1]; ++b1) {
-
- T * out = optr + b1 * out_step[1] + b2 * out_step[2] + b3 * out_step[3];
- T const *in = iptr + b1 * in_step[1] + b2 * in_step[2] + b3 * in_step[3];
- accT const *filt = fptr + b1 *filt_step[1] + b2 *filt_step[2] + b3 *filt_step[3];
-
- switch(baseDim) {
- case 1: one2one_1d<T, accT, expand>(out, in, filt, oDims, sDims, fDims, sStrides); break;
- case 2: one2one_2d<T, accT, expand>(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break;
- case 3: one2one_3d<T, accT, expand>(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break;
- }
- }
- }
- }
-}
-
template<typename T, typename accT, dim_t baseDim, bool expand>
Array<T> convolve(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind)
{
@@ -188,7 +31,6 @@ Array<T> convolve(Array<T> const& signal, Array<accT> const& filter, ConvolveBat
auto sDims = signal.dims();
auto fDims = filter.dims();
- auto sStrides = signal.strides();
dim4 oDims(1);
if (expand) {
@@ -209,52 +51,11 @@ Array<T> convolve(Array<T> const& signal, Array<accT> const& filter, ConvolveBat
Array<T> out = createEmptyArray<T>(oDims);
- getQueue().enqueue(convolve_nd<T, accT, baseDim, expand>,out.get(), signal.get(), filter.get(),
- oDims, sDims, fDims, out.strides(), sStrides, filter.strides(), kind);
+ getQueue().enqueue(kernel::convolve_nd<T, accT, baseDim, expand>,out, signal, filter, kind);
return out;
}
-template<typename T, typename accT, dim_t conv_dim, bool expand>
-void convolve2_separable(T *optr, T const *iptr, accT const *fptr,
- dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim,
- dim4 const &oStrides, dim4 const &sStrides, dim_t fStride)
-{
- for(dim_t j=0; j<oDims[1]; ++j) {
-
- dim_t jOff = j*oStrides[1];
- dim_t cj = j + (conv_dim==1)*(expand ? 0: fDim>>1);
-
- for(dim_t i=0; i<oDims[0]; ++i) {
-
- dim_t iOff = i*oStrides[0];
- dim_t ci = i + (conv_dim==0)*(expand ? 0 : fDim>>1);
-
- accT accum = scalar<accT>(0);
-
- for(dim_t f=0; f<fDim; ++f) {
- T f_val = fptr[f];
- T s_val;
-
- if (conv_dim==0) {
- dim_t offi = ci - f;
- bool isCIValid = offi>=0 && offi<sDims[0];
- bool isCJValid = cj>=0 && cj<sDims[1];
- s_val = (isCJValid && isCIValid ? iptr[cj*sDims[0]+offi] : scalar<T>(0));
- } else {
- dim_t offj = cj - f;
- bool isCIValid = ci>=0 && ci<sDims[0];
- bool isCJValid = offj>=0 && offj<sDims[1];
- s_val = (isCJValid && isCIValid ? iptr[offj*sDims[0]+ci] : scalar<T>(0));
- }
-
- accum += accT(s_val * f_val);
- }
- optr[iOff+jOff] = T(accum);
- }
- }
-}
-
template<typename T, typename accT, bool expand>
Array<T> convolve2(Array<T> const& signal, Array<accT> const& c_filter, Array<accT> const& r_filter)
{
@@ -262,18 +63,16 @@ Array<T> convolve2(Array<T> const& signal, Array<accT> const& c_filter, Array<ac
c_filter.eval();
r_filter.eval();
- auto sDims = signal.dims();
- auto cfDims = c_filter.dims();
- auto rfDims = r_filter.dims();
- auto sStrides = signal.strides();
-
- dim_t cflen = (dim_t)cfDims.elements();
- dim_t rflen = (dim_t)rfDims.elements();
-
+ auto sDims = signal.dims();
dim4 tDims = sDims;
dim4 oDims = sDims;
if (expand) {
+ auto cfDims = c_filter.dims();
+ auto rfDims = r_filter.dims();
+
+ dim_t cflen = (dim_t)cfDims.elements();
+ dim_t rflen = (dim_t)rfDims.elements();
// separable convolve only does CONVOLVE_BATCH_NONE and standard batch(CONVOLVE_BATCH_SIGNAL)
tDims[0] += cflen - 1;
oDims[0] += cflen - 1;
@@ -282,35 +81,7 @@ Array<T> convolve2(Array<T> const& signal, Array<accT> const& c_filter, Array<ac
Array<T> out = createEmptyArray<T>(oDims);
- auto func = [=] (Array<T> out) {
- Array<T> temp = createEmptyArray<T>(tDims);
- auto tStrides = temp.strides();
- auto oStrides = out.strides();
-
- for (dim_t b3=0; b3<oDims[3]; ++b3) {
-
- dim_t i_b3Off = b3*sStrides[3];
- dim_t t_b3Off = b3*tStrides[3];
- dim_t o_b3Off = b3*oStrides[3];
-
- for (dim_t b2=0; b2<oDims[2]; ++b2) {
-
- T const *iptr = signal.get()+ b2*sStrides[2] + i_b3Off;
- T *tptr = temp.get() + b2*tStrides[2] + t_b3Off;
- T *optr = out.get() + b2*oStrides[2] + o_b3Off;
-
- convolve2_separable<T, accT, 0, expand>(tptr, iptr, c_filter.get(),
- tDims, sDims, sDims, cflen,
- tStrides, sStrides, c_filter.strides()[0]);
-
- convolve2_separable<T, accT, 1, expand>(optr, tptr, r_filter.get(),
- oDims, tDims, sDims, rflen,
- oStrides, tStrides, r_filter.strides()[0]);
- }
- }
- };
-
- getQueue().enqueue(func, out);
+ getQueue().enqueue(kernel::convolve2<T, accT, expand>, out, signal, c_filter, r_filter, tDims);
return out;
}
diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp
index 6172af8..2678c7b 100644
--- a/src/backend/cpu/fftconvolve.cpp
+++ b/src/backend/cpu/fftconvolve.cpp
@@ -19,216 +19,11 @@
#include <convolve_common.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/fftconvolve.hpp>
namespace cpu
{
-template<typename To, typename Ti>
-void packData(Array<To> out, const af::dim4 od, const af::dim4 os, Array<Ti> const in)
-{
- To* out_ptr = out.get();
-
- const af::dim4 id = in.dims();
- const af::dim4 is = in.strides();
- const Ti* in_ptr = in.get();
-
- int id0_half = divup(id[0], 2);
- bool odd_id0 = (id[0] % 2 == 1);
-
- for (int d3 = 0; d3 < (int)od[3]; d3++) {
- for (int d2 = 0; d2 < (int)od[2]; d2++) {
- for (int d1 = 0; d1 < (int)od[1]; d1++) {
- for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
- const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-
- if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
- const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;
- out_ptr[oidx] = (To)in_ptr[iidx];
- if (d0 == id0_half-1 && odd_id0)
- out_ptr[oidx+1] = (To)0;
- else
- out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half];
- }
- else {
- // Pad remaining elements with 0s
- out_ptr[oidx] = (To)0;
- out_ptr[oidx+1] = (To)0;
- }
- }
- }
- }
- }
-}
-
-template<typename To, typename Ti>
-void padArray_(Array<To> out, const af::dim4 od, const af::dim4 os,
- Array<Ti> const in, const dim_t offset)
-{
- To* out_ptr = out.get() + offset;
- const af::dim4 id = in.dims();
- const af::dim4 is = in.strides();
- const Ti* in_ptr = in.get();
-
- for (int d3 = 0; d3 < (int)od[3]; d3++) {
- for (int d2 = 0; d2 < (int)od[2]; d2++) {
- for (int d1 = 0; d1 < (int)od[1]; d1++) {
- for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
- const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
-
- if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
- // Copy input elements to real elements, set imaginary elements to 0
- const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;
- out_ptr[oidx] = (To)in_ptr[iidx];
- out_ptr[oidx+1] = (To)0;
- }
- else {
- // Pad remaining of the matrix to 0s
- out_ptr[oidx] = (To)0;
- out_ptr[oidx+1] = (To)0;
- }
- }
- }
- }
- }
-}
-
-template<typename T>
-void complexMultiply(Array<T> packed, const af::dim4 sig_dims, const af::dim4 sig_strides,
- const af::dim4 fit_dims, const af::dim4 fit_strides,
- ConvolveBatchKind kind, const dim_t offset)
-{
- T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0);
- T* in1_ptr = packed.get();
- T* in2_ptr = packed.get() + offset;
-
- const dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims);
- const dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides);
- const dim4& i1d = sig_dims;
- const dim4& i2d = fit_dims;
- const dim4& i1s = sig_strides;
- const dim4& i2s = fit_strides;
-
- for (int d3 = 0; d3 < (int)od[3]; d3++) {
- for (int d2 = 0; d2 < (int)od[2]; d2++) {
- for (int d1 = 0; d1 < (int)od[1]; d1++) {
- for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
- if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) {
- // Complex multiply each signal to equivalent filter
- const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
- const int iidx = ridx + 1;
-
- T a = in1_ptr[ridx];
- T b = in1_ptr[iidx];
- T c = in2_ptr[ridx];
- T d = in2_ptr[iidx];
-
- T ac = a*c;
- T bd = b*d;
-
- out_ptr[ridx] = ac - bd;
- out_ptr[iidx] = (a+b) * (c+d) - ac - bd;
- }
- else if (kind == CONVOLVE_BATCH_SIGNAL) {
- // Complex multiply all signals to filter
- const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
- const int iidx1 = ridx1 + 1;
- const int ridx2 = ridx1 % (i2s[3] * i2d[3]);
- const int iidx2 = iidx1 % (i2s[3] * i2d[3]);
-
- T a = in1_ptr[ridx1];
- T b = in1_ptr[iidx1];
- T c = in2_ptr[ridx2];
- T d = in2_ptr[iidx2];
-
- T ac = a*c;
- T bd = b*d;
-
- out_ptr[ridx1] = ac - bd;
- out_ptr[iidx1] = (a+b) * (c+d) - ac - bd;
- }
- else if (kind == CONVOLVE_BATCH_KERNEL) {
- // Complex multiply signal to all filters
- const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
- const int iidx2 = ridx2 + 1;
- const int ridx1 = ridx2 % (i1s[3] * i1d[3]);
- const int iidx1 = iidx2 % (i1s[3] * i1d[3]);
-
- T a = in1_ptr[ridx1];
- T b = in1_ptr[iidx1];
- T c = in2_ptr[ridx2];
- T d = in2_ptr[iidx2];
-
- T ac = a*c;
- T bd = b*d;
-
- out_ptr[ridx2] = ac - bd;
- out_ptr[iidx2] = (a+b) * (c+d) - ac - bd;
- }
- }
- }
- }
- }
-}
-
-template<typename To, typename Ti, bool roundOut>
-void reorderOutput(To* out_ptr, const af::dim4& od, const af::dim4& os,
- const Ti* in_ptr, const af::dim4& id, const af::dim4& is,
- const af::dim4& fd, const int half_di0, const int baseDim,
- const int fftScale, const bool expand)
-{
- for (int d3 = 0; d3 < (int)od[3]; d3++) {
- for (int d2 = 0; d2 < (int)od[2]; d2++) {
- for (int d1 = 0; d1 < (int)od[1]; d1++) {
- for (int d0 = 0; d0 < (int)od[0]; d0++) {
- int id0, id1, id2, id3;
- if (expand) {
- id0 = d0;
- id1 = d1 * is[1];
- id2 = d2 * is[2];
- id3 = d3 * is[3];
- }
- else {
- id0 = d0 + fd[0]/2;
- id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1];
- id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2];
- id3 = d3 * is[3];
- }
-
- int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0;
-
- // Divide output elements to cuFFT resulting scale, round result if output
- // type is single or double precision floating-point
- if (id0 < half_di0) {
- // Copy top elements
- int iidx = id3 + id2 + id1 + id0 * 2;
- if (roundOut)
- out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
- else
- out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
- }
- else if (id0 < half_di0 + (int)fd[0] - 1) {
- // Add signal and filter elements to central part
- int iidx1 = id3 + id2 + id1 + id0 * 2;
- int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
- if (roundOut)
- out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale));
- else
- out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale);
- }
- else {
- // Copy bottom elements
- const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
- if (roundOut)
- out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
- else
- out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
- }
- }
- }
- }
- }
-}
-
template<typename T, typename convT, typename cT, bool isDouble, bool roundOut, dim_t baseDim>
Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
const bool expand, ConvolveBatchKind kind)
@@ -289,11 +84,11 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
// Pack signal in a complex matrix where first dimension is half the input
// (allows faster FFT computation) and pad array to a power of 2 with 0s
- getQueue().enqueue(packData<convT, T>, packed, sig_tmp_dims, sig_tmp_strides, signal);
+ getQueue().enqueue(kernel::packData<convT, T>, packed, sig_tmp_dims, sig_tmp_strides, signal);
// Pad filter array with 0s
const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3];
- getQueue().enqueue(padArray_<convT, T>, packed, filter_tmp_dims, filter_tmp_strides,
+ getQueue().enqueue(kernel::padArray<convT, T>, packed, filter_tmp_dims, filter_tmp_strides,
filter, offset);
dim4 fftDims(1, 1, 1, 1);
@@ -346,7 +141,7 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
getQueue().enqueue(upstream_dft, packed, fftDims);
// Multiply filter and signal FFT arrays
- getQueue().enqueue(complexMultiply<convT>, packed,
+ getQueue().enqueue(kernel::complexMultiply<convT>, packed,
sig_tmp_dims, sig_tmp_strides,
filter_tmp_dims, filter_tmp_strides,
kind, offset);
@@ -416,10 +211,10 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
Array<T> out = createEmptyArray<T>(oDims);
- auto reorderFunc = [=] (Array<T> out, Array<convT> packed,
- const Array<T> filter, const dim_t sig_hald_d0, const dim_t fftScale,
- const dim4 sig_tmp_dims, const dim4 sig_tmp_strides,
- const dim4 filter_tmp_dims, const dim4 filter_tmp_strides) {
+ auto reorderFunc = [=](Array<T> out, Array<convT> packed,
+ const Array<T> filter, const dim_t sig_hald_d0, const dim_t fftScale,
+ const dim4 sig_tmp_dims, const dim4 sig_tmp_strides,
+ const dim4 filter_tmp_dims, const dim4 filter_tmp_strides) {
T* out_ptr = out.get();
const af::dim4 out_dims = out.dims();
const af::dim4 out_strides = out.strides();
@@ -432,17 +227,16 @@ Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
// Reorder the output
if (kind == CONVOLVE_BATCH_KERNEL) {
- reorderOutput<T, convT, roundOut>
- (out_ptr, out_dims, out_strides,
- filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
- filter_dims, sig_half_d0, baseDim, fftScale, expand);
+ kernel::reorderHelper<T, convT, roundOut>(out_ptr, out_dims, out_strides,
+ filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
+ filter_dims, sig_half_d0, baseDim, fftScale, expand);
} else {
- reorderOutput<T, convT, roundOut>
- (out_ptr, out_dims, out_strides,
- sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
- filter_dims, sig_half_d0, baseDim, fftScale, expand);
+ kernel::reorderHelper<T, convT, roundOut>(out_ptr, out_dims, out_strides,
+ sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
+ filter_dims, sig_half_d0, baseDim, fftScale, expand);
}
};
+
getQueue().enqueue(reorderFunc, out, packed, filter, sig_half_d0, fftScale,
sig_tmp_dims, sig_tmp_strides,
filter_tmp_dims, filter_tmp_strides);
diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp
index 63bae2d..51c4804 100644
--- a/src/backend/cpu/kernel/approx1.hpp
+++ b/src/backend/cpu/kernel/approx1.hpp
@@ -9,6 +9,7 @@
#include <Array.hpp>
#include <math.hpp>
+
namespace cpu
{
namespace kernel
diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/kernel/bilateral.hpp
similarity index 67%
copy from src/backend/cpu/bilateral.cpp
copy to src/backend/cpu/kernel/bilateral.hpp
index ea38ea7..2b5764f 100644
--- a/src/backend/cpu/bilateral.cpp
+++ b/src/backend/cpu/kernel/bilateral.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,37 +7,27 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <bilateral.hpp>
-#include <cmath>
-#include <algorithm>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
namespace cpu
{
+namespace kernel
+{
-static inline dim_t clamp(int a, dim_t mn, dim_t mx)
+inline
+dim_t clamp(int a, dim_t mn, dim_t mx)
{
return (a < (int)mn ? mn : (a > (int)mx ? mx : a));
}
-static inline unsigned getIdx(const dim4 &strides,
- int i, int j = 0, int k = 0, int l = 0)
+inline
+unsigned getIdx(const dim4 &strides, int i, int j = 0, int k = 0, int l = 0)
{
- return (l * strides[3] +
- k * strides[2] +
- j * strides[1] +
- i * strides[0]);
+ return (l * strides[3] + k * strides[2] + j * strides[1] + i * strides[0]);
}
template<typename inType, typename outType, bool isColor>
-void bilateral_(Array<outType> out, const Array<inType> in, float s_sigma, float c_sigma)
+void bilateral(Array<outType> out, const Array<inType> in, float s_sigma, float c_sigma)
{
const dim4 dims = in.dims();
const dim4 istrides = in.strides();
@@ -96,27 +86,5 @@ void bilateral_(Array<outType> out, const Array<inType> in, float s_sigma, float
}
}
-template<typename inType, typename outType, bool isColor>
-Array<outType> bilateral(const Array<inType> &in, const float &s_sigma, const float &c_sigma)
-{
- in.eval();
- const dim4 dims = in.dims();
- Array<outType> out = createEmptyArray<outType>(dims);
- getQueue().enqueue(bilateral_<inType, outType, isColor>, out, in, s_sigma, c_sigma);
- return out;
}
-
-#define INSTANTIATE(inT, outT)\
-template Array<outT> bilateral<inT, outT,true >(const Array<inT> &in, const float &s_sigma, const float &c_sigma);\
-template Array<outT> bilateral<inT, outT,false>(const Array<inT> &in, const float &s_sigma, const float &c_sigma);
-
-INSTANTIATE(double, double)
-INSTANTIATE(float , float)
-INSTANTIATE(char , float)
-INSTANTIATE(int , float)
-INSTANTIATE(uint , float)
-INSTANTIATE(uchar , float)
-INSTANTIATE(short , float)
-INSTANTIATE(ushort, float)
-
}
diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/kernel/convolve.hpp
similarity index 62%
copy from src/backend/cpu/convolve.cpp
copy to src/backend/cpu/kernel/convolve.hpp
index 239b4f0..d39acb6 100644
--- a/src/backend/cpu/convolve.cpp
+++ b/src/backend/cpu/kernel/convolve.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,20 +7,14 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <convolve.hpp>
-#include <err_cpu.hpp>
-#include <math.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
namespace cpu
{
+namespace kernel
+{
+
+using af::dim4;
template<typename T, typename accT, bool expand>
void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
@@ -129,11 +123,20 @@ void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims,
}
template<typename T, typename accT, dim_t baseDim, bool expand>
-void convolve_nd(T *optr, T const *iptr, accT const *fptr,
- dim4 const &oDims, dim4 const &sDims, dim4 const &fDims,
- dim4 const &oStrides, dim4 const &sStrides, dim4 const &fStrides,
- ConvolveBatchKind kind)
+void convolve_nd(Array<T> out, Array<T> const signal, Array<accT> const filter, ConvolveBatchKind kind)
{
+ T * optr = out.get();
+ T const * const iptr = signal.get();
+ accT const * const fptr = filter.get();
+
+ dim4 const oDims = out.dims();
+ dim4 const sDims = signal.dims();
+ dim4 const fDims = filter.dims();
+
+ dim4 const oStrides = out.strides();
+ dim4 const sStrides = signal.strides();
+ dim4 const fStrides = filter.strides();
+
dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */
@@ -180,41 +183,6 @@ void convolve_nd(T *optr, T const *iptr, accT const *fptr,
}
}
-template<typename T, typename accT, dim_t baseDim, bool expand>
-Array<T> convolve(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind)
-{
- signal.eval();
- filter.eval();
-
- auto sDims = signal.dims();
- auto fDims = filter.dims();
- auto sStrides = signal.strides();
-
- dim4 oDims(1);
- if (expand) {
- for(dim_t d=0; d<4; ++d) {
- if (kind==CONVOLVE_BATCH_NONE || kind==CONVOLVE_BATCH_KERNEL) {
- oDims[d] = sDims[d]+fDims[d]-1;
- } else {
- oDims[d] = (d<baseDim ? sDims[d]+fDims[d]-1 : sDims[d]);
- }
- }
- } else {
- oDims = sDims;
- if (kind==CONVOLVE_BATCH_KERNEL) {
- for (dim_t i=baseDim; i<4; ++i)
- oDims[i] = fDims[i];
- }
- }
-
- Array<T> out = createEmptyArray<T>(oDims);
-
- getQueue().enqueue(convolve_nd<T, accT, baseDim, expand>,out.get(), signal.get(), filter.get(),
- oDims, sDims, fDims, out.strides(), sStrides, filter.strides(), kind);
-
- return out;
-}
-
template<typename T, typename accT, dim_t conv_dim, bool expand>
void convolve2_separable(T *optr, T const *iptr, accT const *fptr,
dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim,
@@ -256,86 +224,44 @@ void convolve2_separable(T *optr, T const *iptr, accT const *fptr,
}
template<typename T, typename accT, bool expand>
-Array<T> convolve2(Array<T> const& signal, Array<accT> const& c_filter, Array<accT> const& r_filter)
+void convolve2(Array<T> out, Array<T> const signal,
+ Array<accT> const c_filter, Array<accT> const r_filter,
+ dim4 const tDims)
{
- signal.eval();
- c_filter.eval();
- r_filter.eval();
+ Array<T> temp = createEmptyArray<T>(tDims);
- auto sDims = signal.dims();
- auto cfDims = c_filter.dims();
- auto rfDims = r_filter.dims();
- auto sStrides = signal.strides();
-
- dim_t cflen = (dim_t)cfDims.elements();
- dim_t rflen = (dim_t)rfDims.elements();
+ dim_t cflen = (dim_t)c_filter.elements();
+ dim_t rflen = (dim_t)r_filter.elements();
- dim4 tDims = sDims;
- dim4 oDims = sDims;
-
- if (expand) {
- // separable convolve only does CONVOLVE_BATCH_NONE and standard batch(CONVOLVE_BATCH_SIGNAL)
- tDims[0] += cflen - 1;
- oDims[0] += cflen - 1;
- oDims[1] += rflen - 1;
- }
+ auto oDims = out.dims();
+ auto sDims = signal.dims();
- Array<T> out = createEmptyArray<T>(oDims);
-
- auto func = [=] (Array<T> out) {
- Array<T> temp = createEmptyArray<T>(tDims);
- auto tStrides = temp.strides();
- auto oStrides = out.strides();
+ auto oStrides = out.strides();
+ auto sStrides = signal.strides();
+ auto tStrides = temp.strides();
- for (dim_t b3=0; b3<oDims[3]; ++b3) {
+ for (dim_t b3=0; b3<oDims[3]; ++b3) {
- dim_t i_b3Off = b3*sStrides[3];
- dim_t t_b3Off = b3*tStrides[3];
- dim_t o_b3Off = b3*oStrides[3];
+ dim_t i_b3Off = b3*sStrides[3];
+ dim_t t_b3Off = b3*tStrides[3];
+ dim_t o_b3Off = b3*oStrides[3];
- for (dim_t b2=0; b2<oDims[2]; ++b2) {
+ for (dim_t b2=0; b2<oDims[2]; ++b2) {
- T const *iptr = signal.get()+ b2*sStrides[2] + i_b3Off;
- T *tptr = temp.get() + b2*tStrides[2] + t_b3Off;
- T *optr = out.get() + b2*oStrides[2] + o_b3Off;
+ T const *iptr = signal.get()+ b2*sStrides[2] + i_b3Off;
+ T *tptr = temp.get() + b2*tStrides[2] + t_b3Off;
+ T *optr = out.get() + b2*oStrides[2] + o_b3Off;
- convolve2_separable<T, accT, 0, expand>(tptr, iptr, c_filter.get(),
- tDims, sDims, sDims, cflen,
- tStrides, sStrides, c_filter.strides()[0]);
+ convolve2_separable<T, accT, 0, expand>(tptr, iptr, c_filter.get(),
+ tDims, sDims, sDims, cflen,
+ tStrides, sStrides, c_filter.strides()[0]);
- convolve2_separable<T, accT, 1, expand>(optr, tptr, r_filter.get(),
- oDims, tDims, sDims, rflen,
- oStrides, tStrides, r_filter.strides()[0]);
- }
+ convolve2_separable<T, accT, 1, expand>(optr, tptr, r_filter.get(),
+ oDims, tDims, sDims, rflen,
+ oStrides, tStrides, r_filter.strides()[0]);
}
- };
-
- getQueue().enqueue(func, out);
-
- return out;
+ }
}
-#define INSTANTIATE(T, accT) \
- template Array<T> convolve <T, accT, 1, true >(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind); \
- template Array<T> convolve <T, accT, 1, false>(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind); \
- template Array<T> convolve <T, accT, 2, true >(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind); \
- template Array<T> convolve <T, accT, 2, false>(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind); \
- template Array<T> convolve <T, accT, 3, true >(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind); \
- template Array<T> convolve <T, accT, 3, false>(Array<T> const& signal, Array<accT> const& filter, ConvolveBatchKind kind); \
- template Array<T> convolve2<T, accT, true >(Array<T> const& signal, Array<accT> const& c_filter, Array<accT> const& r_filter); \
- template Array<T> convolve2<T, accT, false>(Array<T> const& signal, Array<accT> const& c_filter, Array<accT> const& r_filter);
-
-INSTANTIATE(cdouble, cdouble)
-INSTANTIATE(cfloat , cfloat)
-INSTANTIATE(double , double)
-INSTANTIATE(float , float)
-INSTANTIATE(uint , float)
-INSTANTIATE(int , float)
-INSTANTIATE(uchar , float)
-INSTANTIATE(char , float)
-INSTANTIATE(ushort , float)
-INSTANTIATE(short , float)
-INSTANTIATE(uintl , float)
-INSTANTIATE(intl , float)
-
+}
}
diff --git a/src/backend/cpu/kernel/fftconvolve.hpp b/src/backend/cpu/kernel/fftconvolve.hpp
new file mode 100644
index 0000000..30bac66
--- /dev/null
+++ b/src/backend/cpu/kernel/fftconvolve.hpp
@@ -0,0 +1,227 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+#include <convolve_common.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+using af::dim4;
+
+template<typename To, typename Ti>
+void packData(Array<To> out, const af::dim4 od, const af::dim4 os, Array<Ti> const in)
+{
+ To* out_ptr = out.get();
+
+ const af::dim4 id = in.dims();
+ const af::dim4 is = in.strides();
+ const Ti* in_ptr = in.get();
+
+ int id0_half = divup(id[0], 2);
+ bool odd_id0 = (id[0] % 2 == 1);
+
+ for (int d3 = 0; d3 < (int)od[3]; d3++) {
+ for (int d2 = 0; d2 < (int)od[2]; d2++) {
+ for (int d1 = 0; d1 < (int)od[1]; d1++) {
+ for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
+ const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+
+ if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
+ const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;
+ out_ptr[oidx] = (To)in_ptr[iidx];
+ if (d0 == id0_half-1 && odd_id0)
+ out_ptr[oidx+1] = (To)0;
+ else
+ out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half];
+ }
+ else {
+ // Pad remaining elements with 0s
+ out_ptr[oidx] = (To)0;
+ out_ptr[oidx+1] = (To)0;
+ }
+ }
+ }
+ }
+ }
+}
+
+template<typename To, typename Ti>
+void padArray(Array<To> out, const af::dim4 od, const af::dim4 os,
+ Array<Ti> const in, const dim_t offset)
+{
+ To* out_ptr = out.get() + offset;
+ const af::dim4 id = in.dims();
+ const af::dim4 is = in.strides();
+ const Ti* in_ptr = in.get();
+
+ for (int d3 = 0; d3 < (int)od[3]; d3++) {
+ for (int d2 = 0; d2 < (int)od[2]; d2++) {
+ for (int d1 = 0; d1 < (int)od[1]; d1++) {
+ for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
+ const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+
+ if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) {
+ // Copy input elements to real elements, set imaginary elements to 0
+ const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0;
+ out_ptr[oidx] = (To)in_ptr[iidx];
+ out_ptr[oidx+1] = (To)0;
+ }
+ else {
+ // Pad remaining of the matrix to 0s
+ out_ptr[oidx] = (To)0;
+ out_ptr[oidx+1] = (To)0;
+ }
+ }
+ }
+ }
+ }
+}
+
+template<typename T>
+void complexMultiply(Array<T> packed, const af::dim4 sig_dims, const af::dim4 sig_strides,
+ const af::dim4 fit_dims, const af::dim4 fit_strides,
+ ConvolveBatchKind kind, const dim_t offset)
+{
+ T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0);
+ T* in1_ptr = packed.get();
+ T* in2_ptr = packed.get() + offset;
+
+ const dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims);
+ const dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides);
+ const dim4& i1d = sig_dims;
+ const dim4& i2d = fit_dims;
+ const dim4& i1s = sig_strides;
+ const dim4& i2s = fit_strides;
+
+ for (int d3 = 0; d3 < (int)od[3]; d3++) {
+ for (int d2 = 0; d2 < (int)od[2]; d2++) {
+ for (int d1 = 0; d1 < (int)od[1]; d1++) {
+ for (int d0 = 0; d0 < (int)od[0] / 2; d0++) {
+ if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) {
+ // Complex multiply each signal to equivalent filter
+ const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+ const int iidx = ridx + 1;
+
+ T a = in1_ptr[ridx];
+ T b = in1_ptr[iidx];
+ T c = in2_ptr[ridx];
+ T d = in2_ptr[iidx];
+
+ T ac = a*c;
+ T bd = b*d;
+
+ out_ptr[ridx] = ac - bd;
+ out_ptr[iidx] = (a+b) * (c+d) - ac - bd;
+ }
+ else if (kind == CONVOLVE_BATCH_SIGNAL) {
+ // Complex multiply all signals to filter
+ const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+ const int iidx1 = ridx1 + 1;
+ const int ridx2 = ridx1 % (i2s[3] * i2d[3]);
+ const int iidx2 = iidx1 % (i2s[3] * i2d[3]);
+
+ T a = in1_ptr[ridx1];
+ T b = in1_ptr[iidx1];
+ T c = in2_ptr[ridx2];
+ T d = in2_ptr[iidx2];
+
+ T ac = a*c;
+ T bd = b*d;
+
+ out_ptr[ridx1] = ac - bd;
+ out_ptr[iidx1] = (a+b) * (c+d) - ac - bd;
+ }
+ else if (kind == CONVOLVE_BATCH_KERNEL) {
+ // Complex multiply signal to all filters
+ const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2;
+ const int iidx2 = ridx2 + 1;
+ const int ridx1 = ridx2 % (i1s[3] * i1d[3]);
+ const int iidx1 = iidx2 % (i1s[3] * i1d[3]);
+
+ T a = in1_ptr[ridx1];
+ T b = in1_ptr[iidx1];
+ T c = in2_ptr[ridx2];
+ T d = in2_ptr[iidx2];
+
+ T ac = a*c;
+ T bd = b*d;
+
+ out_ptr[ridx2] = ac - bd;
+ out_ptr[iidx2] = (a+b) * (c+d) - ac - bd;
+ }
+ }
+ }
+ }
+ }
+}
+
+template<typename To, typename Ti, bool roundOut>
+void reorderHelper(To* out_ptr, const af::dim4& od, const af::dim4& os,
+ const Ti* in_ptr, const af::dim4& id, const af::dim4& is,
+ const af::dim4& fd, const int half_di0, const int baseDim,
+ const int fftScale, const bool expand)
+{
+ for (int d3 = 0; d3 < (int)od[3]; d3++) {
+ for (int d2 = 0; d2 < (int)od[2]; d2++) {
+ for (int d1 = 0; d1 < (int)od[1]; d1++) {
+ for (int d0 = 0; d0 < (int)od[0]; d0++) {
+ int id0, id1, id2, id3;
+ if (expand) {
+ id0 = d0;
+ id1 = d1 * is[1];
+ id2 = d2 * is[2];
+ id3 = d3 * is[3];
+ }
+ else {
+ id0 = d0 + fd[0]/2;
+ id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1];
+ id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2];
+ id3 = d3 * is[3];
+ }
+
+ int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0;
+
+ // Divide output elements to cuFFT resulting scale, round result if output
+ // type is single or double precision floating-point
+ if (id0 < half_di0) {
+ // Copy top elements
+ int iidx = id3 + id2 + id1 + id0 * 2;
+ if (roundOut)
+ out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
+ else
+ out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
+ }
+ else if (id0 < half_di0 + (int)fd[0] - 1) {
+ // Add signal and filter elements to central part
+ int iidx1 = id3 + id2 + id1 + id0 * 2;
+ int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
+ if (roundOut)
+ out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale));
+ else
+ out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale);
+ }
+ else {
+ // Copy bottom elements
+ const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1;
+ if (roundOut)
+ out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale));
+ else
+ out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale);
+ }
+ }
+ }
+ }
+ }
+}
+
+}
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list