[arrayfire] 204/408: FEAT/TEST: Adding R2C and C2R FFT transforms for all backends

Mon Sep 21 19:11:58 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.

commit 155293f8875365de38715e34aa96eb6ab7d3d932
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date:   Thu Aug 6 23:08:15 2015 -0400

    FEAT/TEST: Adding R2C and C2R FFT transforms for all backends
    
    - These transforms use lower memory foot prints, exploiting symmetry
    - Added relevant tests
---
 include/af/signal.h         |  26 +++++++++
 src/api/c/fft.cpp           | 101 +++++++++++++++++++++++++++++++++++
 src/api/c/fft_common.hpp    |  42 +++++++++++++++
 src/api/cpp/fft.cpp         | 101 +++++++++++++++++++++++++++++++++++
 src/backend/cpu/fft.cpp     | 127 ++++++++++++++++++++++++++++++++++++++++++--
 src/backend/cpu/fft.hpp     |   6 +++
 src/backend/cuda/copy.cu    |   1 +
 src/backend/cuda/fft.cpp    | 107 +++++++++++++++++++++++++++++++++++--
 src/backend/cuda/fft.hpp    |   6 +++
 src/backend/cuda/memory.cpp |  11 ++++
 src/backend/opencl/fft.cpp  | 125 +++++++++++++++++++++++++++++++++++++++----
 src/backend/opencl/fft.hpp  |   6 +++
 test/fft_real.cpp           | 120 +++++++++++++++++++++++++++++++++++++++++
 13 files changed, 762 insertions(+), 17 deletions(-)

diff --git a/include/af/signal.h b/include/af/signal.h
index fbbd277..35edb79 100644
--- a/include/af/signal.h
+++ b/include/af/signal.h
@@ -370,6 +370,20 @@ AFAPI array idft(const array& in, const dim4 outDims);
  */
 AFAPI array idft(const array& in);
 
+template<int rank>
+array fftR2C(const array &in,
+             const dim4& dims,
+             const double norm_factor = 0);
+
+
+template<int rank>
+array fftR2C(const array &in,
+             const double norm_factor = 0);
+
+template<int rank>
+array fftC2R(const array &in, bool is_odd = false,
+                 const double norm_factor = 0);
+
 /**
    C++ Interface for convolution any(one through three) dimensional data
 
@@ -753,6 +767,18 @@ AFAPI af_err af_ifft3(af_array *out, const af_array in, const double norm_factor
 */
 AFAPI af_err af_ifft3_inplace(af_array in, const double norm_factor);
 
+AFAPI af_err af_fft_r2c (af_array *out, const af_array in, const double norm_factor, const dim_t pad0);
+
+AFAPI af_err af_fft2_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0, const dim_t pad1);
+
+AFAPI af_err af_fft3_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0, const dim_t pad1, const dim_t pad2);
+
+AFAPI af_err af_fft_c2r (af_array *out, const af_array in, const double norm_factor, const bool is_odd);
+
+AFAPI af_err af_fft2_c2r(af_array *out, const af_array in, const double norm_factor, const bool is_odd);
+
+AFAPI af_err af_fft3_c2r(af_array *out, const af_array in, const double norm_factor, const bool is_odd);
+
 
 /**
    C Interface for convolution on one dimensional data
diff --git a/src/api/c/fft.cpp b/src/api/c/fft.cpp
index 1bc3df6..d472cdb 100644
--- a/src/api/c/fft.cpp
+++ b/src/api/c/fft.cpp
@@ -146,3 +146,104 @@ af_err af_ifft3_inplace(af_array in, const double norm_factor)
 {
     return fft_inplace<3, false>(in, norm_factor);
 }
+
+template<typename inType, typename outType, int rank>
+static af_array fft_r2c(const af_array in, const double norm_factor,
+                        const dim_t npad, const dim_t  * const pad)
+{
+    return getHandle(fft_r2c<inType, outType, rank>(getArray<inType>(in),
+                                                    norm_factor, npad, pad));
+}
+
+template<int rank>
+static af_err fft_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t npad, const dim_t * const pad)
+{
+    try {
+        ArrayInfo info = getInfo(in);
+        af_dtype type  = info.getType();
+        af::dim4 dims  = info.dims();
+
+        DIM_ASSERT(1, (dims.ndims()>=rank));
+
+        af_array output;
+        switch(type) {
+            case f32: output = fft_r2c<float , cfloat  , rank>(in, norm_factor, npad, pad); break;
+            case f64: output = fft_r2c<double, cdouble , rank>(in, norm_factor, npad, pad); break;
+        default: {
+            TYPE_ERROR(1, type);
+        }
+        }
+        std::swap(*out,output);
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+af_err af_fft_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0)
+{
+    const dim_t pad[1] = {pad0};
+    return fft_r2c<1>(out, in, norm_factor, (pad0>0?1:0), pad);
+}
+
+af_err af_fft2_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0, const dim_t pad1)
+{
+    const dim_t pad[2] = {pad0, pad1};
+    return fft_r2c<2>(out, in, norm_factor, (pad0>0&&pad1>0?2:0), pad);
+}
+
+af_err af_fft3_r2c(af_array *out, const af_array in, const double norm_factor, const dim_t pad0, const dim_t pad1, const dim_t pad2)
+{
+    const dim_t pad[3] = {pad0, pad1, pad2};
+    return fft_r2c<3>(out, in, norm_factor, (pad0>0&&pad1>0&&pad2>0?3:0), pad);
+}
+
+
+template<typename inType, typename outType, int rank>
+static af_array fft_c2r(const af_array in, const double norm_factor,
+                        const dim4 &odims)
+{
+    return getHandle(fft_c2r<inType, outType, rank>(getArray<inType>(in),
+                                                    norm_factor, odims));
+}
+
+template<int rank>
+static af_err fft_c2r(af_array *out, const af_array in, const double norm_factor, const bool is_odd)
+{
+    try {
+        ArrayInfo info = getInfo(in);
+        af_dtype type  = info.getType();
+        af::dim4 idims  = info.dims();
+
+        DIM_ASSERT(1, (idims.ndims()>=rank));
+
+        dim4 odims = idims;
+        odims[0] = 2 * (odims[0] - 1) + (is_odd ? 1 : 0);
+
+        af_array output;
+        switch(type) {
+            case c32: output = fft_c2r<cfloat , float  , rank>(in, norm_factor, odims); break;
+            case c64: output = fft_c2r<cdouble, double , rank>(in, norm_factor, odims); break;
+            default: TYPE_ERROR(1, type);
+        }
+        std::swap(*out,output);
+    }
+    CATCHALL;
+
+    return AF_SUCCESS;
+}
+
+af_err af_fft_c2r(af_array *out, const af_array in, const double norm_factor, const bool is_odd)
+{
+    return fft_c2r<1>(out, in, norm_factor, is_odd);
+}
+
+af_err af_fft2_c2r(af_array *out, const af_array in, const double norm_factor, const bool is_odd)
+{
+    return fft_c2r<2>(out, in, norm_factor, is_odd);
+}
+
+af_err af_fft3_c2r(af_array *out, const af_array in, const double norm_factor, const bool is_odd)
+{
+    return fft_c2r<3>(out, in, norm_factor, is_odd);
+}
diff --git a/src/api/c/fft_common.hpp b/src/api/c/fft_common.hpp
index 8c4de85..bc6af36 100644
--- a/src/api/c/fft_common.hpp
+++ b/src/api/c/fft_common.hpp
@@ -34,3 +34,45 @@ Array<outType> fft(const Array<inType> input, const double norm_factor,
 
     return output;
 }
+
+template<typename inType, typename outType, int rank>
+Array<outType> fft_r2c(const Array<inType> input, const double norm_factor,
+                       const dim_t npad, const dim_t  * const pad)
+{
+    dim4 idims = input.dims();
+
+    bool is_pad = false;
+    for (int i = 0; i < npad; i++) {
+        is_pad |= (pad[i] != idims[i]);
+    }
+
+    Array<inType> tmp = input;
+
+    if (is_pad) {
+        dim4 pdims(1);
+        computePaddedDims(pdims, input.dims(), npad, pad);
+        tmp = padArray<inType, inType>(input, pdims, scalar<inType>(0), norm_factor);
+    }
+
+    Array<outType> output = fft_r2c<outType, inType, rank>(tmp);
+    if (!is_pad && norm_factor != 1) {
+        // Normalize input because tmp was not normalized
+        multiply_inplace(output, norm_factor);
+    }
+
+    return output;
+}
+
+template<typename inType, typename outType, int rank>
+Array<outType> fft_c2r(const Array<inType> input, const double norm_factor,
+                       const dim4 &odims)
+{
+    Array<outType> output = fft_c2r<outType, inType, rank>(input, odims);
+
+    if (norm_factor != 1) {
+        // Normalize input because tmp was not normalized
+        multiply_inplace(output, norm_factor);
+    }
+
+    return output;
+}
diff --git a/src/api/cpp/fft.cpp b/src/api/cpp/fft.cpp
index fe92549..1fe4a09 100644
--- a/src/api/cpp/fft.cpp
+++ b/src/api/cpp/fft.cpp
@@ -180,4 +180,105 @@ void ifft3InPlace(array& in, const double norm_factor)
     AF_THROW(af_ifft3_inplace(in.get(), norm));
 }
 
+template<>
+AFAPI array fftR2C<1>(const array &in, const dim4 &dims,
+                      const double norm_factor)
+{
+    af_array res;
+    AF_THROW(af_fft_r2c(&res, in.get(), norm_factor == 0 ? 1.0 : norm_factor, dims[0]));
+    return array(res);
+}
+
+template<>
+AFAPI array fftR2C<2>(const array &in, const dim4 &dims,
+                      const double norm_factor)
+{
+    af_array res;
+    AF_THROW(af_fft2_r2c(&res, in.get(), norm_factor == 0 ? 1.0 : norm_factor, dims[0], dims[1]));
+    return array(res);
+}
+
+template<>
+AFAPI array fftR2C<3>(const array &in, const dim4 &dims,
+                      const double norm_factor)
+{
+    af_array res;
+    AF_THROW(af_fft3_r2c(&res, in.get(), norm_factor == 0 ? 1.0 : norm_factor,
+                         dims[0], dims[1], dims[2]));
+    return array(res);
+}
+
+inline dim_t getOrigDim(dim_t d, bool is_odd)
+{
+    return 2 * (d - 1) + (is_odd ? 1 : 0);
+}
+
+template<>
+AFAPI array fftC2R<1>(const array &in, const bool is_odd,
+                      const double norm_factor)
+{
+    double norm = norm_factor;
+
+    if (norm == 0) {
+        dim4 idims = in.dims();
+        dim_t dim0 = getOrigDim(idims[0], is_odd);
+        norm = 1.0/dim0;
+    }
+
+    af_array res;
+    AF_THROW(af_fft_c2r(&res, in.get(), norm, is_odd));
+    return array(res);
+}
+
+template<>
+AFAPI array fftC2R<2>(const array &in, const bool is_odd,
+                      const double norm_factor)
+{
+    double norm = norm_factor;
+
+    if (norm == 0) {
+        dim4 idims = in.dims();
+        dim_t dim0 = getOrigDim(idims[0], is_odd);
+        dim_t dim1 = idims[1];
+        norm = 1.0/(dim0 * dim1);
+    }
+
+
+    af_array res;
+    AF_THROW(af_fft2_c2r(&res, in.get(), norm, is_odd));
+    return array(res);
+}
+
+template<>
+AFAPI array fftC2R<3>(const array &in, const bool is_odd,
+                      const double norm_factor)
+{
+    double norm = norm_factor;
+
+    if (norm == 0) {
+        dim4 idims = in.dims();
+        dim_t dim0 = getOrigDim(idims[0], is_odd);
+        dim_t dim1 = idims[1];
+        dim_t dim2 = idims[2];
+        norm = 1.0/(dim0 * dim1 * dim2);
+    }
+
+    af_array res;
+    AF_THROW(af_fft3_c2r(&res, in.get(), norm, is_odd));
+    return array(res);
+}
+
+#define FFT_REAL(rank)                                      \
+    template<>                                              \
+    AFAPI array fftR2C<rank>(const array &in,               \
+                             const double norm_factor)      \
+    {                                                       \
+        return fftR2C<rank>(in, in.dims(), norm_factor);    \
+    }                                                       \
+                                                            \
+
+FFT_REAL(1)
+FFT_REAL(2)
+FFT_REAL(3)
+
 }
diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp
index b933ec4..e41c8a1 100644
--- a/src/backend/cpu/fft.cpp
+++ b/src/backend/cpu/fft.cpp
@@ -54,12 +54,12 @@ TRANSFORM(fftw, cdouble)
 template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &in)
 {
-    int in_dims[rank];
+    int t_dims[rank];
     int in_embed[rank];
 
     const dim4 idims = in.dims();
 
-    computeDims<rank>(in_dims  , idims);
+    computeDims<rank>(t_dims  , idims);
     computeDims<rank>(in_embed , in.getDataDims());
 
     const dim4 istrides = in.strides();
@@ -75,7 +75,7 @@ void fft_inplace(Array<T> &in)
     }
 
     plan = transform.create(rank,
-                            in_dims,
+                            t_dims,
                             (int)batch,
                             (ctype_t *)in.get(),
                             in_embed, (int)istrides[0],
@@ -88,7 +88,119 @@ void fft_inplace(Array<T> &in)
 
     transform.execute(plan);
     transform.destroy(plan);
+}
+
+template<typename To, typename Ti>
+struct fftw_real_transform;
+
+#define TRANSFORM_REAL(PRE, To, Ti, POST)                               \
+    template<>                                                          \
+    struct fftw_real_transform<To, Ti>                                  \
+    {                                                                   \
+        typedef PRE##_plan plan_t;                                      \
+        typedef PRE##_complex ctype_t;                                  \
+                                                                        \
+        template<typename... Args>                                      \
+            plan_t create(Args... args)                                 \
+        { return PRE##_plan_many_dft_##POST(args...); }                 \
+        void execute(plan_t plan) { return PRE##_execute(plan); }       \
+        void destroy(plan_t plan) { return PRE##_destroy_plan(plan); }  \
+    };                                                                  \
+
+
+TRANSFORM_REAL(fftwf, cfloat , float , r2c)
+TRANSFORM_REAL(fftw , cdouble, double, r2c)
+TRANSFORM_REAL(fftwf, float , cfloat , c2r)
+TRANSFORM_REAL(fftw , double, cdouble, c2r)
+
+template<typename Tc, typename Tr, int rank>
+Array<Tc> fft_r2c(const Array<Tr> &in)
+{
+    dim4 idims = in.dims();
+    dim4 odims = in.dims();
+
+    odims[0] = odims[0] / 2 + 1;
+
+    Array<Tc> out = createEmptyArray<Tc>(odims);
+
+    int t_dims[rank];
+    int in_embed[rank];
+    int out_embed[rank];
+
+    computeDims<rank>(t_dims  , idims);
+    computeDims<rank>(in_embed , in.getDataDims());
+    computeDims<rank>(out_embed , out.getDataDims());
+
+    const dim4 istrides = in.strides();
+    const dim4 ostrides = out.strides();
+
+    typedef typename fftw_real_transform<Tc, Tr>::ctype_t ctype_t;
+    typename fftw_real_transform<Tc, Tr>::plan_t plan;
+
+    fftw_real_transform<Tc, Tr> transform;
+
+    int batch = 1;
+    for (int i = rank; i < 4; i++) {
+        batch *= idims[i];
+    }
+
+    plan = transform.create(rank,
+                            t_dims,
+                            (int)batch,
+                            (Tr *)in.get(),
+                            in_embed, (int)istrides[0],
+                            (int)istrides[rank],
+                            (ctype_t *)out.get(),
+                            out_embed, (int)ostrides[0],
+                            (int)ostrides[rank],
+                            FFTW_ESTIMATE);
 
+    transform.execute(plan);
+    transform.destroy(plan);
+
+    return out;
+}
+
+template<typename Tr, typename Tc, int rank>
+Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
+{
+    Array<Tr> out = createEmptyArray<Tr>(odims);
+
+    int t_dims[rank];
+    int in_embed[rank];
+    int out_embed[rank];
+
+    computeDims<rank>(t_dims  , odims);
+    computeDims<rank>(in_embed , in.getDataDims());
+    computeDims<rank>(out_embed , out.getDataDims());
+
+    const dim4 istrides = in.strides();
+    const dim4 ostrides = out.strides();
+
+    typedef typename fftw_real_transform<Tr, Tc>::ctype_t ctype_t;
+    typename fftw_real_transform<Tr, Tc>::plan_t plan;
+
+    fftw_real_transform<Tr, Tc> transform;
+
+    int batch = 1;
+    for (int i = rank; i < 4; i++) {
+        batch *= odims[i];
+    }
+
+    plan = transform.create(rank,
+                            t_dims,
+                            (int)batch,
+                            (ctype_t *)in.get(),
+                            in_embed, (int)istrides[0],
+                            (int)istrides[rank],
+                            (Tr *)out.get(),
+                            out_embed, (int)ostrides[0],
+                            (int)ostrides[rank],
+                            FFTW_ESTIMATE);
+
+    transform.execute(plan);
+    transform.destroy(plan);
+    return out;
 }
 
 #define INSTANTIATE(T)                                      \
@@ -102,5 +214,14 @@ void fft_inplace(Array<T> &in)
     INSTANTIATE(cfloat )
     INSTANTIATE(cdouble)
 
+#define INSTANTIATE_REAL(Tr, Tc)                                        \
+    template Array<Tc> fft_r2c<Tc, Tr, 1>(const Array<Tr> &in);         \
+    template Array<Tc> fft_r2c<Tc, Tr, 2>(const Array<Tr> &in);         \
+    template Array<Tc> fft_r2c<Tc, Tr, 3>(const Array<Tr> &in);         \
+    template Array<Tr> fft_c2r<Tr, Tc, 1>(const Array<Tc> &in, const dim4 &odims); \
+    template Array<Tr> fft_c2r<Tr, Tc, 2>(const Array<Tc> &in, const dim4 &odims); \
+    template Array<Tr> fft_c2r<Tr, Tc, 3>(const Array<Tc> &in, const dim4 &odims); \
 
+    INSTANTIATE_REAL(float , cfloat )
+    INSTANTIATE_REAL(double, cdouble)
 }
diff --git a/src/backend/cpu/fft.hpp b/src/backend/cpu/fft.hpp
index 0414ecb..02f4c9a 100644
--- a/src/backend/cpu/fft.hpp
+++ b/src/backend/cpu/fft.hpp
@@ -14,4 +14,10 @@ namespace cpu
 
 template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &in);
+
+template<typename Tc, typename Tr, int rank>
+Array<Tc> fft_r2c(const Array<Tr> &in);
+
+template<typename Tr, typename Tc, int rank>
+Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims);
 }
diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu
index 0358bfa..43e132c 100644
--- a/src/backend/cuda/copy.cu
+++ b/src/backend/cuda/copy.cu
@@ -70,6 +70,7 @@ namespace cuda
         ARG_ASSERT(1, (in.ndims() == dims.ndims()));
         Array<outType> ret = createEmptyArray<outType>(dims);
         kernel::copy<inType, outType>(ret, in, in.ndims(), default_value, factor);
+        CUDA_CHECK(cudaDeviceSynchronize());
         return ret;
     }
 
diff --git a/src/backend/cuda/fft.cpp b/src/backend/cuda/fft.cpp
index ff90cb4..f5f8543 100644
--- a/src/backend/cuda/fft.cpp
+++ b/src/backend/cuda/fft.cpp
@@ -103,11 +103,13 @@ void find_cufft_plan(cufftHandle &plan, int rank, int *n,
             break;
         }
     }
+
     // return mHandles[plan_index] if plan_index valid
     if (plan_index!=-1) {
         plan = planner.mHandles[plan_index];
         return;
     }
+
     // otherwise create a new plan and set it at mAvailSlotIndex
     // and finally set it to output plan variable
     int slot_index = planner.mAvailSlotIndex;
@@ -116,6 +118,7 @@ void find_cufft_plan(cufftHandle &plan, int rank, int *n,
     cufftHandle temp;
     cufftResult res = cufftPlanMany(&temp, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch);
 
+
     // If plan creation fails, clean up the memory we hold on to and try again
     if (res != CUFFT_SUCCESS) {
         garbageCollect();
@@ -145,6 +148,26 @@ struct cufft_transform;
 CUFFT_FUNC(cfloat , C2C)
 CUFFT_FUNC(cdouble, Z2Z)
 
+template<typename To, typename Ti>
+struct cufft_real_transform;
+
+#define CUFFT_REAL_FUNC(To, Ti, TRANSFORM_TYPE)                 \
+    template<>                                                  \
+    struct cufft_real_transform<To, Ti>                         \
+    {                                                           \
+        enum { type = CUFFT_##TRANSFORM_TYPE };                 \
+        cufftResult                                             \
+            operator() (cufftHandle plan, Ti *in, To *out) {    \
+            return cufftExec##TRANSFORM_TYPE(plan, in, out);    \
+        }                                                       \
+    };
+
+CUFFT_REAL_FUNC(cfloat , float , R2C)
+CUFFT_REAL_FUNC(cdouble, double, D2Z)
+
+CUFFT_REAL_FUNC(float , cfloat , C2R)
+CUFFT_REAL_FUNC(double, cdouble, Z2D)
+
 template<int rank>
 void computeDims(int rdims[rank], const dim4 &idims)
 {
@@ -159,10 +182,10 @@ void fft_inplace(Array<T> &in)
     const dim4 idims    = in.dims();
     const dim4 istrides = in.strides();
 
-    int in_dims[rank];
+    int t_dims[rank];
     int in_embed[rank];
 
-    computeDims<rank>(in_dims, idims);
+    computeDims<rank>(t_dims, idims);
     computeDims<rank>(in_embed, in.getDataDims());
 
     int batch = 1;
@@ -171,7 +194,7 @@ void fft_inplace(Array<T> &in)
     }
 
     cufftHandle plan;
-    find_cufft_plan(plan, rank, in_dims,
+    find_cufft_plan(plan, rank, t_dims,
                     in_embed , istrides[0], istrides[rank],
                     in_embed , istrides[0], istrides[rank],
                     (cufftType)cufft_transform<T>::type, batch);
@@ -180,6 +203,74 @@ void fft_inplace(Array<T> &in)
     CUFFT_CHECK(transform(plan, (T *)in.get(), in.get(), direction ? CUFFT_FORWARD : CUFFT_INVERSE));
 }
 
+template<typename Tc, typename Tr, int rank>
+Array<Tc> fft_r2c(const Array<Tr> &in)
+{
+    dim4 idims = in.dims();
+    dim4 odims = in.dims();
+
+    odims[0] = odims[0] / 2 + 1;
+
+    Array<Tc> out = createEmptyArray<Tc>(odims);
+
+    int t_dims[rank];
+    int in_embed[rank], out_embed[rank];
+
+    computeDims<rank>(t_dims, idims);
+    computeDims<rank>(in_embed, in.getDataDims());
+    computeDims<rank>(out_embed, out.getDataDims());
+
+    int batch = 1;
+    for (int i = rank; i < 4; i++) {
+        batch *= idims[i];
+    }
+
+    dim4 istrides = in.strides();
+    dim4 ostrides = out.strides();
+
+    cufftHandle plan;
+    find_cufft_plan(plan, rank, t_dims,
+                    in_embed  , istrides[0], istrides[rank],
+                    out_embed , ostrides[0], ostrides[rank],
+                    (cufftType)cufft_real_transform<Tc, Tr>::type, batch);
+
+    cufft_real_transform<Tc, Tr> transform;
+    CUFFT_CHECK(transform(plan, (Tr *)in.get(), out.get()));
+    return out;
+}
+
+template<typename Tr, typename Tc, int rank>
+Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
+{
+    Array<Tr> out = createEmptyArray<Tr>(odims);
+
+    int t_dims[rank];
+    int in_embed[rank], out_embed[rank];
+
+    computeDims<rank>(t_dims, odims);
+    computeDims<rank>(in_embed, in.getDataDims());
+    computeDims<rank>(out_embed, out.getDataDims());
+
+    int batch = 1;
+    for (int i = rank; i < 4; i++) {
+        batch *= odims[i];
+    }
+
+    dim4 istrides = in.strides();
+    dim4 ostrides = out.strides();
+
+    cufft_real_transform<Tr, Tc> transform;
+
+    cufftHandle plan;
+    find_cufft_plan(plan, rank, t_dims,
+                    in_embed  , istrides[0], istrides[rank],
+                    out_embed , ostrides[0], ostrides[rank],
+                    (cufftType)cufft_real_transform<Tr, Tc>::type, batch);
+
+    CUFFT_CHECK(transform(plan, (Tc *)in.get(), out.get()));
+    return out;
+}
+
 #define INSTANTIATE(T)                                      \
     template void fft_inplace<T, 1, true >(Array<T> &in);   \
     template void fft_inplace<T, 2, true >(Array<T> &in);   \
@@ -191,4 +282,14 @@ void fft_inplace(Array<T> &in)
     INSTANTIATE(cfloat )
     INSTANTIATE(cdouble)
 
+#define INSTANTIATE_REAL(Tr, Tc)                                        \
+    template Array<Tc> fft_r2c<Tc, Tr, 1>(const Array<Tr> &in);         \
+    template Array<Tc> fft_r2c<Tc, Tr, 2>(const Array<Tr> &in);         \
+    template Array<Tc> fft_r2c<Tc, Tr, 3>(const Array<Tr> &in);         \
+    template Array<Tr> fft_c2r<Tr, Tc, 1>(const Array<Tc> &in, const dim4 &odims); \
+    template Array<Tr> fft_c2r<Tr, Tc, 2>(const Array<Tc> &in, const dim4 &odims); \
+    template Array<Tr> fft_c2r<Tr, Tc, 3>(const Array<Tc> &in, const dim4 &odims); \
+
+    INSTANTIATE_REAL(float , cfloat )
+    INSTANTIATE_REAL(double, cdouble)
 }
diff --git a/src/backend/cuda/fft.hpp b/src/backend/cuda/fft.hpp
index bac02d5..2d0ee2c 100644
--- a/src/backend/cuda/fft.hpp
+++ b/src/backend/cuda/fft.hpp
@@ -15,4 +15,10 @@ namespace cuda
 template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &out);
 
+template<typename Tc, typename Tr, int rank>
+Array<Tc> fft_r2c(const Array<Tr> &in);
+
+template<typename Tr, typename Tc, int rank>
+Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims);
+
 }
diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp
index 9e0da35..ee96b80 100644
--- a/src/backend/cuda/memory.cpp
+++ b/src/backend/cuda/memory.cpp
@@ -17,6 +17,8 @@
 #include <dispatch.hpp>
 #include <platform.hpp>
 
+#define AF_CUDA_MEM_DEBUG
+
 namespace cuda
 {
     static size_t memory_resolution = 1024; //1KB
@@ -89,6 +91,15 @@ namespace cuda
         pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is
     }
 
+    void garbageCollect()
+    {
+    }
+
+    void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                          size_t *lock_bytes,  size_t *lock_buffers)
+    {
+    }
+
 #else
 
     // Manager Class
diff --git a/src/backend/opencl/fft.cpp b/src/backend/opencl/fft.cpp
index 1ce5ac0..d5922f0 100644
--- a/src/backend/opencl/fft.cpp
+++ b/src/backend/opencl/fft.cpp
@@ -20,8 +20,6 @@
 #include <string>
 #include <cstdio>
 #include <memory.hpp>
-#include <iostream>
-#include <handle.hpp>
 
 using af::dim4;
 using std::string;
@@ -36,6 +34,7 @@ namespace opencl
 class clFFTPlanner
 {
     friend void find_clfft_plan(clfftPlanHandle &plan,
+                                clfftLayout iLayout, clfftLayout oLayout,
                                 clfftDim rank, size_t *clLengths,
                                 size_t *istrides, size_t idist,
                                 size_t *ostrides, size_t odist,
@@ -72,6 +71,7 @@ class clFFTPlanner
 };
 
 void find_clfft_plan(clfftPlanHandle &plan,
+                     clfftLayout iLayout, clfftLayout oLayout,
                      clfftDim rank, size_t *clLengths,
                      size_t *istrides, size_t idist,
                      size_t *ostrides, size_t odist,
@@ -81,7 +81,7 @@ void find_clfft_plan(clfftPlanHandle &plan,
 
     // create the key string
     char key_str_temp[64];
-    sprintf(key_str_temp, "%d:", rank);
+    sprintf(key_str_temp, "%d:%d:%d:", iLayout, oLayout, rank);
 
     string key_string(key_str_temp);
 
@@ -142,13 +142,19 @@ void find_clfft_plan(clfftPlanHandle &plan,
     // Context() returns the actual cl_context handle
     CLFFT_CHECK(clfftCreateDefaultPlan(&temp, getContext()(), rank, clLengths));
 
-    CLFFT_CHECK(clfftSetLayout(temp, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
+    // complex to complex
+    if (iLayout == oLayout) {
+        CLFFT_CHECK(clfftSetResultLocation(temp, CLFFT_INPLACE));
+    } else {
+        CLFFT_CHECK(clfftSetResultLocation(temp, CLFFT_OUTOFPLACE));
+    }
+
+    CLFFT_CHECK(clfftSetLayout(temp, iLayout, oLayout));
     CLFFT_CHECK(clfftSetPlanBatchSize(temp, batch));
     CLFFT_CHECK(clfftSetPlanDistance(temp, idist, odist));
     CLFFT_CHECK(clfftSetPlanInStride(temp, rank, istrides));
     CLFFT_CHECK(clfftSetPlanOutStride(temp, rank, ostrides));
     CLFFT_CHECK(clfftSetPlanPrecision(temp, precision));
-    CLFFT_CHECK(clfftSetResultLocation(temp, CLFFT_INPLACE));
     CLFFT_CHECK(clfftSetPlanScale(temp, CLFFT_BACKWARD, 1.0));
 
     // getQueue() returns object of type CommandQueue
@@ -201,20 +207,22 @@ template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &in)
 {
     verifySupported<rank>(in.dims());
-    size_t idims[4], istrides[4], iembed[4];
+    size_t tdims[4], istrides[4];
 
-    computeDims(idims   , in.dims());
-    computeDims(iembed  , in.getDataDims());
+    computeDims(tdims   , in.dims());
     computeDims(istrides, in.strides());
 
     clfftPlanHandle plan;
 
     int batch = 1;
     for (int i = rank; i < 4; i++) {
-        batch *= idims[i];
+        batch *= tdims[i];
     }
 
-    find_clfft_plan(plan, (clfftDim)rank, idims,
+    find_clfft_plan(plan,
+                    CLFFT_COMPLEX_INTERLEAVED,
+                    CLFFT_COMPLEX_INTERLEAVED,
+                    (clfftDim)rank, tdims,
                     istrides, istrides[rank],
                     istrides, istrides[rank],
                     (clfftPrecision)Precision<T>::type,
@@ -229,14 +237,109 @@ void fft_inplace(Array<T> &in)
                                       &imem, &imem, NULL));
 }
 
+template<typename Tc, typename Tr, int rank>
+Array<Tc> fft_r2c(const Array<Tr> &in)
+{
+    dim4 odims = in.dims();
+
+    odims[0] = odims[0] / 2 + 1;
+
+    Array<Tc> out = createEmptyArray<Tc>(odims);
+
+    verifySupported<rank>(in.dims());
+    size_t tdims[4], istrides[4], ostrides[4];
+
+    computeDims(tdims   ,  in.dims());
+    computeDims(istrides,  in.strides());
+    computeDims(ostrides, out.strides());
+
+    clfftPlanHandle plan;
+
+    int batch = 1;
+    for (int i = rank; i < 4; i++) {
+        batch *= tdims[i];
+    }
+
+    find_clfft_plan(plan,
+                    CLFFT_REAL,
+                    CLFFT_HERMITIAN_INTERLEAVED,
+                    (clfftDim)rank, tdims,
+                    istrides, istrides[rank],
+                    ostrides, ostrides[rank],
+                    (clfftPrecision)Precision<Tc>::type,
+                    batch);
+
+    cl_mem imem = (*in.get())();
+    cl_mem omem = (*out.get())();
+    cl_command_queue queue = getQueue()();
+
+    CLFFT_CHECK(clfftEnqueueTransform(plan,
+                                      CLFFT_FORWARD,
+                                      1, &queue, 0, NULL, NULL,
+                                      &imem, &omem, NULL));
+
+    return out;
+}
+
+template<typename Tr, typename Tc, int rank>
+Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
+{
+    Array<Tr> out = createEmptyArray<Tr>(odims);
+
+    verifySupported<rank>(odims);
+    size_t tdims[4], istrides[4], ostrides[4];
+
+    computeDims(tdims   ,  odims);
+    computeDims(istrides,  in.strides());
+    computeDims(ostrides, out.strides());
+
+    clfftPlanHandle plan;
+
+    int batch = 1;
+    for (int i = rank; i < 4; i++) {
+        batch *= tdims[i];
+    }
+
+    find_clfft_plan(plan,
+                    CLFFT_HERMITIAN_INTERLEAVED,
+                    CLFFT_REAL,
+                    (clfftDim)rank, tdims,
+                    istrides, istrides[rank],
+                    ostrides, ostrides[rank],
+                    (clfftPrecision)Precision<Tc>::type,
+                    batch);
+
+    cl_mem imem = (*in.get())();
+    cl_mem omem = (*out.get())();
+    cl_command_queue queue = getQueue()();
+
+    CLFFT_CHECK(clfftEnqueueTransform(plan,
+                                      CLFFT_BACKWARD,
+                                      1, &queue, 0, NULL, NULL,
+                                      &imem, &omem, NULL));
+
+    return out;
+}
+
 #define INSTANTIATE(T)                                      \
     template void fft_inplace<T, 1, true >(Array<T> &in);   \
     template void fft_inplace<T, 2, true >(Array<T> &in);   \
     template void fft_inplace<T, 3, true >(Array<T> &in);   \
     template void fft_inplace<T, 1, false>(Array<T> &in);   \
     template void fft_inplace<T, 2, false>(Array<T> &in);   \
-    template void fft_inplace<T, 3, false>(Array<T> &in);
+    template void fft_inplace<T, 3, false>(Array<T> &in);   \
 
     INSTANTIATE(cfloat )
     INSTANTIATE(cdouble)
+
+#define INSTANTIATE_REAL(Tr, Tc)                                        \
+    template Array<Tc> fft_r2c<Tc, Tr, 1>(const Array<Tr> &in);         \
+    template Array<Tc> fft_r2c<Tc, Tr, 2>(const Array<Tr> &in);         \
+    template Array<Tc> fft_r2c<Tc, Tr, 3>(const Array<Tr> &in);         \
+    template Array<Tr> fft_c2r<Tr, Tc, 1>(const Array<Tc> &in, const dim4 &odims); \
+    template Array<Tr> fft_c2r<Tr, Tc, 2>(const Array<Tc> &in, const dim4 &odims); \
+    template Array<Tr> fft_c2r<Tr, Tc, 3>(const Array<Tc> &in, const dim4 &odims); \
+
+    INSTANTIATE_REAL(float , cfloat )
+    INSTANTIATE_REAL(double, cdouble)
 }
diff --git a/src/backend/opencl/fft.hpp b/src/backend/opencl/fft.hpp
index 3b48254..a1d9e6e 100644
--- a/src/backend/opencl/fft.hpp
+++ b/src/backend/opencl/fft.hpp
@@ -15,4 +15,10 @@ namespace opencl
 template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &in);
 
+template<typename Tc, typename Tr, int rank>
+Array<Tc> fft_r2c(const Array<Tr> &in);
+
+template<typename Tr, typename Tc, int rank>
+Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims);
+
 }
diff --git a/test/fft_real.cpp b/test/fft_real.cpp
new file mode 100644
index 0000000..c8d9a55
--- /dev/null
+++ b/test/fft_real.cpp
@@ -0,0 +1,120 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <gtest/gtest.h>
+#include <arrayfire.h>
+#include <af/dim4.hpp>
+#include <af/traits.hpp>
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <testHelpers.hpp>
+
+using std::string;
+using std::vector;
+using af::cfloat;
+using af::cdouble;
+
+
+template<typename T>
+class FFT_REAL : public ::testing::Test
+{
+};
+
+typedef ::testing::Types<af::cfloat, af::cdouble> TestTypes;
+TYPED_TEST_CASE(FFT_REAL, TestTypes);
+
+template<int rank>
+af::array fft(const af::array &in, double norm)
+{
+    switch(rank) {
+    case 1: return af::fftNorm(in, norm);
+    case 2: return af::fft2Norm(in, norm);
+    case 3: return af::fft3Norm(in, norm);
+    default: return in;
+    }
+}
+
+#define MY_ASSERT_NEAR(aa, bb, cc) ASSERT_NEAR(abs(aa), abs(bb), (cc))
+
+template<typename Tc, int rank>
+void fft_real(af::dim4 dims)
+{
+    typedef typename af::dtype_traits<Tc>::base_type Tr;
+    if (noDoubleTests<Tr>()) return;
+
+    af::dtype ty = (af::dtype)af::dtype_traits<Tr>::af_type;
+    af::array a = af::randu(dims, ty);
+
+    bool is_odd = dims[0] & 1;
+
+    int dim0 = dims[0] / 2 + 1;
+
+    double norm = 1;
+    for (int i = 0; i < rank; i++) norm *= dims[i];
+    norm = 1/norm;
+
+    af::array as = af::fftR2C<rank>(a, norm);
+    af::array af = fft<rank>(a, norm);
+
+
+    std::vector<Tc> has(as.elements());
+    std::vector<Tc> haf(af.elements());
+
+    as.host(&has[0]);
+    af.host(&haf[0]);
+
+    for (int j = 0; j < a.elements() / dims[0]; j++) {
+        for (int i = 0; i < dim0; i++) {
+            MY_ASSERT_NEAR(haf[j * dims[0] + i], has[j * dim0 + i], 1E-2) << "at " << j * dims[0] + i;
+        }
+    }
+
+    af::array b = af::fftC2R<rank>(as, is_odd, 1);
+
+    std::vector<Tr> ha(a.elements());
+    std::vector<Tr> hb(a.elements());
+
+    a.host(&ha[0]);
+    b.host(&hb[0]);
+
+    for (int j = 0; j < a.elements(); j++) {
+        ASSERT_NEAR(ha[j], hb[j], 1E-2);
+    }
+}
+
+TYPED_TEST(FFT_REAL, Even1D)
+{
+    fft_real<TypeParam, 1>(af::dim4(1024, 256));
+}
+
+TYPED_TEST(FFT_REAL, Odd1D)
+{
+    fft_real<TypeParam, 1>(af::dim4(625, 256));
+}
+
+TYPED_TEST(FFT_REAL, Even2D)
+{
+    fft_real<TypeParam, 2>(af::dim4(1024, 256));
+}
+
+TYPED_TEST(FFT_REAL, Odd2D)
+{
+    fft_real<TypeParam, 2>(af::dim4(625, 256));
+}
+
+TYPED_TEST(FFT_REAL, Even3D)
+{
+    fft_real<TypeParam, 3>(af::dim4(32, 32, 32));
+}
+
+TYPED_TEST(FFT_REAL, Odd3D)
+{
+    fft_real<TypeParam, 3>(af::dim4(25, 32, 32));
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git