[arrayfire] 81/284: Added missing eval for input Array's in cpu backend fns

Sun Feb 7 18:59:21 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.

commit 3ba9633e8d1dba645ba29695ef99e11a616ef2f2
Author: pradeep <pradeep at arrayfire.com>
Date:   Thu Dec 17 14:22:46 2015 -0500

    Added missing eval for input Array's in cpu backend fns
    
    This change has some style fixes related to cpu namespace
---
 src/backend/cpu/Array.cpp             |  402 ++++----
 src/backend/cpu/approx.cpp            |  584 +++++------
 src/backend/cpu/bilateral.cpp         |    1 +
 src/backend/cpu/cholesky.cpp          |    4 +
 src/backend/cpu/copy.cpp              |  278 ++---
 src/backend/cpu/diagonal.cpp          |  125 +--
 src/backend/cpu/diff.cpp              |  205 ++--
 src/backend/cpu/exampleFunction.cpp   |    7 +
 src/backend/cpu/fft.cpp               |    6 +
 src/backend/cpu/gradient.cpp          |    1 +
 src/backend/cpu/identity.cpp          |    2 +
 src/backend/cpu/iota.cpp              |    2 +
 src/backend/cpu/ireduce.cpp           |  282 ++---
 src/backend/cpu/lu.cpp                |    6 +
 src/backend/cpu/match_template.cpp    |    3 +
 src/backend/cpu/math.cpp              |   72 +-
 src/backend/cpu/meanshift.cpp         |    2 +
 src/backend/cpu/medfilt.cpp           |    2 +
 src/backend/cpu/memory.cpp            |  333 +++---
 src/backend/cpu/nearest_neighbour.cpp |    2 -
 src/backend/cpu/qr.cpp                |    6 +
 src/backend/cpu/range.cpp             |    2 +
 src/backend/cpu/reorder.cpp           |   98 +-
 src/backend/cpu/resize.cpp            |    2 +
 src/backend/cpu/rotate.cpp            |  178 ++--
 src/backend/cpu/set.cpp               |  178 ++--
 src/backend/cpu/shift.cpp             |    6 +-
 src/backend/cpu/sift_nonfree.hpp      | 1811 +++++++++++++++++----------------
 src/backend/cpu/sobel.cpp             |    1 +
 src/backend/cpu/solve.cpp             |   11 +-
 src/backend/cpu/sort.cpp              |   96 +-
 src/backend/cpu/sort_index.cpp        |  140 +--
 src/backend/cpu/susan.cpp             |    2 +
 src/backend/cpu/svd.cpp               |  147 +--
 src/backend/cpu/transform.cpp         |    4 +-
 src/backend/cpu/transpose.cpp         |    1 -
 src/backend/cpu/triangle.cpp          |   25 +-
 src/backend/cpu/unwrap.cpp            |    3 +-
 src/backend/cpu/where.cpp             |   81 +-
 39 files changed, 2606 insertions(+), 2505 deletions(-)

diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp
index 456f4c8..9c15bc4 100644
--- a/src/backend/cpu/Array.cpp
+++ b/src/backend/cpu/Array.cpp
@@ -21,250 +21,251 @@
 
 namespace cpu
 {
-    const int MAX_TNJ_LEN = 20;
-    using TNJ::BufferNode;
-    using TNJ::Node;
-    using TNJ::Node_ptr;
-
-    using af::dim4;
-
-    template<typename T>
-    Array<T>::Array(dim4 dims):
-        info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
-        data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
-    { }
-
-    template<typename T>
-    Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device):
-        info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
-        data((is_device & !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
-        node(), offset(0), ready(true), owner(true)
-    {
-        static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
-        static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
-        if (!is_device || copy_device) {
-            std::copy(in_data, in_data + dims.elements(), data.get());
-        }
-    }
 
-    template<typename T>
-    Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) :
-        info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
-        data(), data_dims(dims),
-        node(n), offset(0), ready(false), owner(true)
-    {
+const int MAX_TNJ_LEN = 20;
+using TNJ::BufferNode;
+using TNJ::Node;
+using TNJ::Node_ptr;
+
+using af::dim4;
+
+template<typename T>
+Array<T>::Array(dim4 dims):
+    info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+    data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
+    node(), offset(0), ready(true), owner(true)
+{ }
+
+template<typename T>
+Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device):
+    info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+    data((is_device & !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
+    node(), offset(0), ready(true), owner(true)
+{
+    static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
+    static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
+    if (!is_device || copy_device) {
+        std::copy(in_data, in_data + dims.elements(), data.get());
     }
+}
 
-    template<typename T>
-    Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
-        info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
-        data(parent.getData()), data_dims(parent.getDataDims()),
-        node(),
-        offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
-        ready(true), owner(false)
-    { }
+template<typename T>
+Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) :
+    info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+    data(), data_dims(dims),
+    node(n), offset(0), ready(false), owner(true)
+{
+}
 
+template<typename T>
+Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
+    info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
+    data(parent.getData()), data_dims(parent.getDataDims()),
+    node(),
+    offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
+    ready(true), owner(false)
+{ }
 
-    template<typename T>
-    void Array<T>::eval()
-    {
-        if (isReady()) return;
-        if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);
 
-        this->setId(getActiveDeviceId());
+template<typename T>
+void Array<T>::eval()
+{
+    if (isReady()) return;
+    if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);
 
-        data = std::shared_ptr<T>(memAlloc<T>(elements()), memFree<T>);
+    this->setId(getActiveDeviceId());
 
-        auto func = [] (Array<T> in) {
-            in.setId(getActiveDeviceId());
-            T *ptr = in.data.get();
+    data = std::shared_ptr<T>(memAlloc<T>(elements()), memFree<T>);
 
-            dim4 odims = in.dims();
-            dim4 ostrs = in.strides();
+    auto func = [] (Array<T> in) {
+        in.setId(getActiveDeviceId());
+        T *ptr = in.data.get();
 
-            bool is_linear = in.node->isLinear(odims.get());
+        dim4 odims = in.dims();
+        dim4 ostrs = in.strides();
 
-            if (is_linear) {
-                int num = in.elements();
-                for (int i = 0; i < num; i++) {
-                    ptr[i] = *(T *)in.node->calc(i);
-                }
-            } else {
-                for (int w = 0; w < (int)odims[3]; w++) {
-                    dim_t offw = w * ostrs[3];
+        bool is_linear = in.node->isLinear(odims.get());
+
+        if (is_linear) {
+            int num = in.elements();
+            for (int i = 0; i < num; i++) {
+                ptr[i] = *(T *)in.node->calc(i);
+            }
+        } else {
+            for (int w = 0; w < (int)odims[3]; w++) {
+                dim_t offw = w * ostrs[3];
 
-                    for (int z = 0; z < (int)odims[2]; z++) {
-                        dim_t offz = z * ostrs[2] + offw;
+                for (int z = 0; z < (int)odims[2]; z++) {
+                    dim_t offz = z * ostrs[2] + offw;
 
-                        for (int y = 0; y < (int)odims[1]; y++) {
-                            dim_t offy = y * ostrs[1] + offz;
+                    for (int y = 0; y < (int)odims[1]; y++) {
+                        dim_t offy = y * ostrs[1] + offz;
 
-                            for (int x = 0; x < (int)odims[0]; x++) {
-                                dim_t id = x + offy;
+                        for (int x = 0; x < (int)odims[0]; x++) {
+                            dim_t id = x + offy;
 
-                                ptr[id] = *(T *)in.node->calc(x, y, z, w);
-                            }
+                            ptr[id] = *(T *)in.node->calc(x, y, z, w);
                         }
                     }
                 }
             }
-        };
-
-        getQueue().enqueue(func, *this);
+        }
+    };
 
-        ready = true;
-        Node_ptr prev = node;
-        prev->reset();
-        // FIXME: Replace the current node in any JIT possible trees with the new BufferNode
-        node.reset();
-    }
+    getQueue().enqueue(func, *this);
 
-    template<typename T>
-    void Array<T>::eval() const
-    {
-        if (isReady()) return;
-        const_cast<Array<T> *>(this)->eval();
-    }
+    ready = true;
+    Node_ptr prev = node;
+    prev->reset();
+    // FIXME: Replace the current node in any JIT possible trees with the new BufferNode
+    node.reset();
+}
 
-    template<typename T>
-    Node_ptr Array<T>::getNode() const
-    {
-        if (!node) {
+template<typename T>
+void Array<T>::eval() const
+{
+    if (isReady()) return;
+    const_cast<Array<T> *>(this)->eval();
+}
 
-            unsigned bytes = this->getDataDims().elements() * sizeof(T);
+template<typename T>
+Node_ptr Array<T>::getNode() const
+{
+    if (!node) {
 
-            BufferNode<T> *buf_node = new BufferNode<T>(data,
-                                                        bytes,
-                                                        offset,
-                                                        dims().get(),
-                                                        strides().get(),
-                                                        isLinear());
+        unsigned bytes = this->getDataDims().elements() * sizeof(T);
 
-            const_cast<Array<T> *>(this)->node = Node_ptr(reinterpret_cast<Node *>(buf_node));
-        }
+        BufferNode<T> *buf_node = new BufferNode<T>(data,
+                                                    bytes,
+                                                    offset,
+                                                    dims().get(),
+                                                    strides().get(),
+                                                    isLinear());
 
-        return node;
+        const_cast<Array<T> *>(this)->node = Node_ptr(reinterpret_cast<Node *>(buf_node));
     }
 
-    template<typename T>
-    Array<T>
-    createHostDataArray(const dim4 &size, const T * const data)
-    {
-        return Array<T>(size, data, false);
-    }
+    return node;
+}
 
-    template<typename T>
-    Array<T>
-    createDeviceDataArray(const dim4 &size, const void *data)
-    {
-        return Array<T>(size, (const T * const) data, true);
-    }
+template<typename T>
+Array<T>
+createHostDataArray(const dim4 &size, const T * const data)
+{
+    return Array<T>(size, data, false);
+}
 
-    template<typename T>
-    Array<T>
-    createValueArray(const dim4 &size, const T& value)
-    {
-        TNJ::ScalarNode<T> *node = new TNJ::ScalarNode<T>(value);
-        return createNodeArray<T>(size, TNJ::Node_ptr(
-                                      reinterpret_cast<TNJ::Node *>(node)));
-    }
+template<typename T>
+Array<T>
+createDeviceDataArray(const dim4 &size, const void *data)
+{
+    return Array<T>(size, (const T * const) data, true);
+}
 
-    template<typename T>
-    Array<T>
-    createEmptyArray(const dim4 &size)
-    {
-        return Array<T>(size);
-    }
+template<typename T>
+Array<T>
+createValueArray(const dim4 &size, const T& value)
+{
+    TNJ::ScalarNode<T> *node = new TNJ::ScalarNode<T>(value);
+    return createNodeArray<T>(size, TNJ::Node_ptr(
+                                  reinterpret_cast<TNJ::Node *>(node)));
+}
 
-    template<typename T>
-    Array<T> *initArray() { return new Array<T>(dim4(0, 0, 0, 0)); }
+template<typename T>
+Array<T>
+createEmptyArray(const dim4 &size)
+{
+    return Array<T>(size);
+}
 
+template<typename T>
+Array<T> *initArray() { return new Array<T>(dim4(0, 0, 0, 0)); }
 
-    template<typename T>
-    Array<T>
-    createNodeArray(const dim4 &dims, Node_ptr node)
-    {
-        Array<T> out =  Array<T>(dims, node);
 
-        unsigned length =0, buf_count = 0, bytes = 0;
+template<typename T>
+Array<T>
+createNodeArray(const dim4 &dims, Node_ptr node)
+{
+    Array<T> out =  Array<T>(dims, node);
 
-        Node *n = node.get();
-        n->getInfo(length, buf_count, bytes);
-        n->reset();
+    unsigned length =0, buf_count = 0, bytes = 0;
 
-        if (length > MAX_TNJ_LEN ||
-            buf_count >= MAX_BUFFERS ||
-            bytes >= MAX_BYTES) {
-            out.eval();
-        }
+    Node *n = node.get();
+    n->getInfo(length, buf_count, bytes);
+    n->reset();
 
-        return out;
+    if (length > MAX_TNJ_LEN ||
+        buf_count >= MAX_BUFFERS ||
+        bytes >= MAX_BYTES) {
+        out.eval();
     }
 
+    return out;
+}
 
-    template<typename T>
-    Array<T> createSubArray(const Array<T>& parent,
-                            const std::vector<af_seq> &index,
-                            bool copy)
-    {
-        parent.eval();
 
-        dim4 dDims = parent.getDataDims();
-        dim4 pDims = parent.dims();
+template<typename T>
+Array<T> createSubArray(const Array<T>& parent,
+                        const std::vector<af_seq> &index,
+                        bool copy)
+{
+    parent.eval();
 
-        dim4 dims   = toDims  (index, pDims);
-        dim4 offset = toOffset(index, dDims);
-        dim4 stride = toStride (index, dDims);
+    dim4 dDims = parent.getDataDims();
+    dim4 pDims = parent.dims();
 
-        Array<T> out = Array<T>(parent, dims, offset, stride);
+    dim4 dims   = toDims  (index, pDims);
+    dim4 offset = toOffset(index, dDims);
+    dim4 stride = toStride (index, dDims);
 
-        if (!copy) return out;
+    Array<T> out = Array<T>(parent, dims, offset, stride);
 
-        if (stride[0] != 1 ||
-            stride[1] <  0 ||
-            stride[2] <  0 ||
-            stride[3] <  0) {
+    if (!copy) return out;
 
-            out = copyArray(out);
-        }
+    if (stride[0] != 1 ||
+        stride[1] <  0 ||
+        stride[2] <  0 ||
+        stride[3] <  0) {
 
-        return out;
+        out = copyArray(out);
     }
 
-    template<typename T>
-    void
-    destroyArray(Array<T> *A)
-    {
-        delete A;
-    }
+    return out;
+}
 
+template<typename T>
+void
+destroyArray(Array<T> *A)
+{
+    delete A;
+}
 
-    template<typename T>
-    void evalArray(const Array<T> &A)
-    {
-        A.eval();
-    }
 
-    template<typename T>
-    void
-    writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
-    {
-        if(!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
-        }
-        memcpy(arr.get() + arr.getOffset(), data, bytes);
+template<typename T>
+void evalArray(const Array<T> &A)
+{
+    A.eval();
+}
+
+template<typename T>
+void
+writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
+{
+    if(!arr.isOwner()) {
+        arr = createEmptyArray<T>(arr.dims());
     }
+    memcpy(arr.get() + arr.getOffset(), data, bytes);
+}
 
-    template<typename T>
-    void
-    writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
-    {
-        if(!arr.isOwner()) {
-            arr = createEmptyArray<T>(arr.dims());
-        }
-        memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes);
+template<typename T>
+void
+writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
+{
+    if(!arr.isOwner()) {
+        arr = createEmptyArray<T>(arr.dims());
     }
+    memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes);
+}
 
 #define INSTANTIATE(T)                                                  \
     template       Array<T>  createHostDataArray<T>   (const dim4 &size, const T * const data); \
@@ -286,16 +287,17 @@ namespace cpu
     template       void      writeHostDataArray<T>    (Array<T> &arr, const T * const data, const size_t bytes); \
     template       void      writeDeviceDataArray<T>  (Array<T> &arr, const void * const data, const size_t bytes); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp
index 4d3c880..7988863 100644
--- a/src/backend/cpu/approx.cpp
+++ b/src/backend/cpu/approx.cpp
@@ -17,330 +17,339 @@
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Approx1
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename Ty, typename Tp, af_interp_type method>
-    struct approx1_op
+
+///////////////////////////////////////////////////////////////////////////
+// Approx1
+///////////////////////////////////////////////////////////////////////////
+template<typename Ty, typename Tp, af_interp_type method>
+struct approx1_op
+{
+    void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+              const Ty *in,  const af::dim4 &idims, const dim_t iElems,
+              const Tp *pos, const af::dim4 &pdims,
+              const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
+              const float offGrid, const bool pBatch,
+              const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
     {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            return;
-        }
-    };
+        return;
+    }
+};
 
-    template<typename Ty, typename Tp>
-    struct approx1_op<Ty, Tp, AF_INTERP_NEAREST>
+template<typename Ty, typename Tp>
+struct approx1_op<Ty, Tp, AF_INTERP_NEAREST>
+{
+    void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+              const Ty *in,  const af::dim4 &idims, const dim_t iElems,
+              const Tp *pos, const af::dim4 &pdims,
+              const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
+              const float offGrid, const bool pBatch,
+              const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
     {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idx;
-            if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
-
-            const Tp x = pos[pmId];
-            bool gFlag = false;
-            if (x < 0 || idims[0] < x+1) {  // No need to check y
-                gFlag = true;
-            }
+        dim_t pmId = idx;
+        if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
 
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                dim_t ioff = idw * istrides[3] + idz * istrides[2]
-                           + idy * istrides[1];
-                const dim_t iMem = round(x) + ioff;
+        const Tp x = pos[pmId];
+        bool gFlag = false;
+        if (x < 0 || idims[0] < x+1) {  // No need to check y
+            gFlag = true;
+        }
 
-                out[omId] = in[iMem];
-            }
+        const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<Ty>(offGrid);
+        } else {
+            dim_t ioff = idw * istrides[3] + idz * istrides[2]
+                       + idy * istrides[1];
+            const dim_t iMem = round(x) + ioff;
+
+            out[omId] = in[iMem];
         }
-    };
+    }
+};
 
-    template<typename Ty, typename Tp>
-    struct approx1_op<Ty, Tp, AF_INTERP_LINEAR>
+template<typename Ty, typename Tp>
+struct approx1_op<Ty, Tp, AF_INTERP_LINEAR>
+{
+    void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+              const Ty *in,  const af::dim4 &idims, const dim_t iElems,
+              const Tp *pos, const af::dim4 &pdims,
+              const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
+              const float offGrid, const bool pBatch,
+              const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
     {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idx;
-            if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
-
-            const Tp x = pos[pmId];
-            bool gFlag = false;
-            if (x < 0 || idims[0] < x+1) {
-                gFlag = true;
-            }
+        dim_t pmId = idx;
+        if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
 
-            const dim_t grid_x = floor(x);  // nearest grid
-            const Tp off_x = x - grid_x; // fractional offset
-
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
-
-                // Check if x and x + 1 are both valid indices
-                bool cond = (x < idims[0] - 1);
-                // Compute Left and Right Weighted Values
-                Ty yl = ((Tp)1.0 - off_x) * in[ioff];
-                Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
-                Ty yo = yl + yr;
-                // Compute Weight used
-                Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
-                // Write final value
-                out[omId] = (yo / wt);
-            }
+        const Tp x = pos[pmId];
+        bool gFlag = false;
+        if (x < 0 || idims[0] < x+1) {
+            gFlag = true;
         }
-    };
-
-    template<typename Ty, typename Tp, af_interp_type method>
-    void approx1_(Ty *out, const af::dim4 &odims, const dim_t oElems,
-            const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-            const Tp *pos, const af::dim4 &pdims,
-            const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
-            const float offGrid)
-    {
-        approx1_op<Ty, Tp, method> op;
-        bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1);
-
-        for(dim_t w = 0; w < odims[3]; w++) {
-            for(dim_t z = 0; z < odims[2]; z++) {
-                for(dim_t y = 0; y < odims[1]; y++) {
-                    for(dim_t x = 0; x < odims[0]; x++) {
-                        op(out, odims, oElems, in, idims, iElems, pos, pdims,
-                           ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
-                    }
+
+        const dim_t grid_x = floor(x);  // nearest grid
+        const Tp off_x = x - grid_x; // fractional offset
+
+        const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<Ty>(offGrid);
+        } else {
+            dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
+
+            // Check if x and x + 1 are both valid indices
+            bool cond = (x < idims[0] - 1);
+            // Compute Left and Right Weighted Values
+            Ty yl = ((Tp)1.0 - off_x) * in[ioff];
+            Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
+            Ty yo = yl + yr;
+            // Compute Weight used
+            Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
+            // Write final value
+            out[omId] = (yo / wt);
+        }
+    }
+};
+
+template<typename Ty, typename Tp, af_interp_type method>
+void approx1_(Array<Ty> output, Array<Ty> const input,
+              Array<Tp> const position, float const offGrid)
+{
+    Ty * out = output.get();
+    Ty const * const in  = input.get();
+    Tp const * const pos = position.get();
+    dim4 const odims     = output.dims();
+    dim4 const idims     = input.dims();
+    dim4 const pdims     = position.dims();
+    dim4 const ostrides  = output.strides();
+    dim4 const istrides  = input.strides();
+    dim4 const pstrides  = position.strides();
+    dim_t const oElems   = output.elements();
+    dim_t const iElems   = input.elements();
+
+    approx1_op<Ty, Tp, method> op;
+    bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1);
+
+    for(dim_t w = 0; w < odims[3]; w++) {
+        for(dim_t z = 0; z < odims[2]; z++) {
+            for(dim_t y = 0; y < odims[1]; y++) {
+                for(dim_t x = 0; x < odims[0]; x++) {
+                    op(out, odims, oElems, in, idims, iElems, pos, pdims,
+                       ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
                 }
             }
         }
     }
+}
 
-    template<typename Ty, typename Tp>
-    Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
-                       const af_interp_type method, const float offGrid)
-    {
-        in.eval();
-        pos.eval();
-
-        af::dim4 odims = in.dims();
-        odims[0] = pos.dims()[0];
-
-        // Create output placeholder
-        Array<Ty> out = createEmptyArray<Ty>(odims);
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_NEAREST>,
-                        out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(), pos.get(), pos.dims(),
-                         out.strides(), in.strides(), pos.strides(), offGrid);
-                break;
-            case AF_INTERP_LINEAR:
-                getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_LINEAR>,
-                        out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(), pos.get(), pos.dims(),
-                         out.strides(), in.strides(), pos.strides(), offGrid);
-                break;
-            default:
-                break;
-        }
-        return out;
+template<typename Ty, typename Tp>
+Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
+                   const af_interp_type method, const float offGrid)
+{
+    in.eval();
+    pos.eval();
+
+    af::dim4 odims = in.dims();
+    odims[0] = pos.dims()[0];
+
+    // Create output placeholder
+    Array<Ty> out = createEmptyArray<Ty>(odims);
+
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_NEAREST>,
+                               out, in, pos, offGrid);
+            break;
+        case AF_INTERP_LINEAR:
+            getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_LINEAR>,
+                               out, in, pos, offGrid);
+            break;
+        default:
+            break;
     }
+    return out;
+}
 
-    ///////////////////////////////////////////////////////////////////////////
-    // Approx2
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename Ty, typename Tp, af_interp_type method>
-    struct approx2_op
+///////////////////////////////////////////////////////////////////////////
+// Approx2
+///////////////////////////////////////////////////////////////////////////
+template<typename Ty, typename Tp, af_interp_type method>
+struct approx2_op
+{
+    void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+              const Ty *in,  const af::dim4 &idims, const dim_t iElems,
+              const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
+              const af::dim4 &ostrides, const af::dim4 &istrides,
+              const af::dim4 &pstrides, const af::dim4 &qstrides,
+              const float offGrid, const bool pBatch,
+              const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
     {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            return;
-        }
-    };
+        return;
+    }
+};
 
-    template<typename Ty, typename Tp>
-    struct approx2_op<Ty, Tp, AF_INTERP_NEAREST>
+template<typename Ty, typename Tp>
+struct approx2_op<Ty, Tp, AF_INTERP_NEAREST>
+{
+    void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+              const Ty *in,  const af::dim4 &idims, const dim_t iElems,
+              const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
+              const af::dim4 &ostrides, const af::dim4 &istrides,
+              const af::dim4 &pstrides, const af::dim4 &qstrides,
+              const float offGrid, const bool pBatch,
+              const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
     {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idy * pstrides[1] + idx;
-            dim_t qmId = idy * qstrides[1] + idx;
-            if(pBatch) {
-                pmId += idw * pstrides[3] + idz * pstrides[2];
-                qmId += idw * qstrides[3] + idz * qstrides[2];
-            }
+        dim_t pmId = idy * pstrides[1] + idx;
+        dim_t qmId = idy * qstrides[1] + idx;
+        if(pBatch) {
+            pmId += idw * pstrides[3] + idz * pstrides[2];
+            qmId += idw * qstrides[3] + idz * qstrides[2];
+        }
 
-            bool gFlag = false;
-            const Tp x = pos[pmId], y = qos[qmId];
-            if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
-                gFlag = true;
-            }
+        bool gFlag = false;
+        const Tp x = pos[pmId], y = qos[qmId];
+        if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
+            gFlag = true;
+        }
 
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
-                const dim_t imId = idw * istrides[3] + idz * istrides[2] +
-                                grid_y * istrides[1] + grid_x;
-                out[omId] = in[imId];
-            }
+        const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<Ty>(offGrid);
+        } else {
+            const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
+            const dim_t imId = idw * istrides[3] + idz * istrides[2] +
+                            grid_y * istrides[1] + grid_x;
+            out[omId] = in[imId];
         }
-    };
+    }
+};
 
-    template<typename Ty, typename Tp>
-    struct approx2_op<Ty, Tp, AF_INTERP_LINEAR>
+template<typename Ty, typename Tp>
+struct approx2_op<Ty, Tp, AF_INTERP_LINEAR>
+{
+    void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+              const Ty *in,  const af::dim4 &idims, const dim_t iElems,
+              const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
+              const af::dim4 &ostrides, const af::dim4 &istrides,
+              const af::dim4 &pstrides, const af::dim4 &qstrides,
+              const float offGrid, const bool pBatch,
+              const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
     {
-        void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
-                  const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-                  const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-                  const af::dim4 &ostrides, const af::dim4 &istrides,
-                  const af::dim4 &pstrides, const af::dim4 &qstrides,
-                  const float offGrid, const bool pBatch,
-                  const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
-        {
-            dim_t pmId = idy * pstrides[1] + idx;
-            dim_t qmId = idy * qstrides[1] + idx;
-            if(pBatch) {
-                pmId += idw * pstrides[3] + idz * pstrides[2];
-                qmId += idw * qstrides[3] + idz * qstrides[2];
-            }
+        dim_t pmId = idy * pstrides[1] + idx;
+        dim_t qmId = idy * qstrides[1] + idx;
+        if(pBatch) {
+            pmId += idw * pstrides[3] + idz * pstrides[2];
+            qmId += idw * qstrides[3] + idz * qstrides[2];
+        }
 
-            bool gFlag = false;
-            const Tp x = pos[pmId], y = qos[qmId];
-            if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
-                gFlag = true;
-            }
+        bool gFlag = false;
+        const Tp x = pos[pmId], y = qos[qmId];
+        if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
+            gFlag = true;
+        }
 
-            const dim_t grid_x = floor(x),   grid_y = floor(y);   // nearest grid
-            const Tp off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
+        const dim_t grid_x = floor(x),   grid_y = floor(y);   // nearest grid
+        const Tp off_x  = x - grid_x, off_y  = y - grid_y; // fractional offset
 
-            // Check if pVal and pVal + 1 are both valid indices
-            bool condY = (y < idims[1] - 1);
-            bool condX = (x < idims[0] - 1);
+        // Check if pVal and pVal + 1 are both valid indices
+        bool condY = (y < idims[1] - 1);
+        bool condX = (x < idims[0] - 1);
 
-            // Compute wieghts used
-            Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y);
-            Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0;
-            Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0;
-            Tp wt11 = (condX && condY) ? (off_x) * (off_y)  : 0;
+        // Compute wieghts used
+        Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y);
+        Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0;
+        Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0;
+        Tp wt11 = (condX && condY) ? (off_x) * (off_y)  : 0;
 
-            Tp wt = wt00 + wt10 + wt01 + wt11;
-            Ty zero = scalar<Ty>(0);
+        Tp wt = wt00 + wt10 + wt01 + wt11;
+        Ty zero = scalar<Ty>(0);
 
-            const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
-                             + idy * ostrides[1] + idx;
-            if(gFlag) {
-                out[omId] = scalar<Ty>(offGrid);
-            } else {
-                dim_t ioff = idw * istrides[3] + idz * istrides[2]
-                        + grid_y * istrides[1] + grid_x;
+        const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+                         + idy * ostrides[1] + idx;
+        if(gFlag) {
+            out[omId] = scalar<Ty>(offGrid);
+        } else {
+            dim_t ioff = idw * istrides[3] + idz * istrides[2]
+                    + grid_y * istrides[1] + grid_x;
 
-                // Compute Weighted Values
-                Ty y00 =                    wt00 * in[ioff];
-                Ty y10 = (condY) ?          wt10 * in[ioff + istrides[1]]     : zero;
-                Ty y01 = (condX) ?          wt01 * in[ioff + 1]               : zero;
-                Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
+            // Compute Weighted Values
+            Ty y00 =                    wt00 * in[ioff];
+            Ty y10 = (condY) ?          wt10 * in[ioff + istrides[1]]     : zero;
+            Ty y01 = (condX) ?          wt01 * in[ioff + 1]               : zero;
+            Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
 
-                Ty yo = y00 + y10 + y01 + y11;
+            Ty yo = y00 + y10 + y01 + y11;
 
-                // Write Final Value
-                out[omId] = (yo / wt);
-            }
+            // Write Final Value
+            out[omId] = (yo / wt);
         }
-    };
-
-    template<typename Ty, typename Tp, af_interp_type method>
-    void approx2_(Ty *out, const af::dim4 &odims, const dim_t oElems,
-            const Ty *in,  const af::dim4 &idims, const dim_t iElems,
-            const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
-            const af::dim4 &ostrides, const af::dim4 &istrides,
-            const af::dim4 &pstrides, const af::dim4 &qstrides,
-            const float offGrid)
-    {
-        approx2_op<Ty, Tp, method> op;
-        bool pBatch = !(pdims[2] == 1 && pdims[3] == 1);
-
-        for(dim_t w = 0; w < odims[3]; w++) {
-            for(dim_t z = 0; z < odims[2]; z++) {
-                for(dim_t y = 0; y < odims[1]; y++) {
-                    for(dim_t x = 0; x < odims[0]; x++) {
-                        op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
-                           ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
-                    }
+    }
+};
+
+template<typename Ty, typename Tp, af_interp_type method>
+void approx2_(Array<Ty> output, Array<Ty> const input,
+              Array<Tp> const position, Array<Tp> const qosition,
+              float const offGrid)
+{
+    Ty * out = output.get();
+    Ty const * const in  = input.get();
+    Tp const * const pos = position.get();
+    Tp const * const qos = qosition.get();
+    dim4 const odims     = output.dims();
+    dim4 const idims     = input.dims();
+    dim4 const pdims     = position.dims();
+    dim4 const qdims     = qosition.dims();
+    dim4 const ostrides  = output.strides();
+    dim4 const istrides  = input.strides();
+    dim4 const pstrides  = position.strides();
+    dim4 const qstrides  = qosition.strides();
+    dim_t const oElems   = output.elements();
+    dim_t const iElems   = input.elements();
+
+    approx2_op<Ty, Tp, method> op;
+    bool pBatch = !(pdims[2] == 1 && pdims[3] == 1);
+
+    for(dim_t w = 0; w < odims[3]; w++) {
+        for(dim_t z = 0; z < odims[2]; z++) {
+            for(dim_t y = 0; y < odims[1]; y++) {
+                for(dim_t x = 0; x < odims[0]; x++) {
+                    op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
+                       ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
                 }
             }
         }
     }
+}
 
-    template<typename Ty, typename Tp>
-    Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
-                       const af_interp_type method, const float offGrid)
-    {
-        in.eval();
-        pos0.eval();
-        pos1.eval();
-
-        af::dim4 odims = in.dims();
-        odims[0] = pos0.dims()[0];
-        odims[1] = pos0.dims()[1];
-
-        // Create output placeholder
-        Array<Ty> out = createEmptyArray<Ty>(odims);
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_NEAREST>,
-                        out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(),
-                         pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
-                         out.strides(), in.strides(), pos0.strides(), pos1.strides(),
-                         offGrid);
-                break;
-            case AF_INTERP_LINEAR:
-                getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_LINEAR>,
-                        out.get(), out.dims(), out.elements(),
-                         in.get(), in.dims(), in.elements(),
-                         pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
-                         out.strides(), in.strides(), pos0.strides(), pos1.strides(),
-                         offGrid);
-                break;
-            default:
-                break;
-        }
-        return out;
+template<typename Ty, typename Tp>
+Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
+                   const af_interp_type method, const float offGrid)
+{
+    in.eval();
+    pos0.eval();
+    pos1.eval();
+
+    af::dim4 odims = in.dims();
+    odims[0] = pos0.dims()[0];
+    odims[1] = pos0.dims()[1];
+
+    Array<Ty> out = createEmptyArray<Ty>(odims);
+
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_NEAREST>,
+                               out, in, pos0, pos1, offGrid);
+            break;
+        case AF_INTERP_LINEAR:
+            getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_LINEAR>,
+                               out, in, pos0, pos1, offGrid);
+            break;
+        default:
+            break;
     }
+    return out;
+}
 
 #define INSTANTIATE(Ty, Tp)                                                                    \
     template Array<Ty> approx1<Ty, Tp>(const Array<Ty> &in, const Array<Tp> &pos,              \
@@ -349,8 +358,9 @@ namespace cpu
                                        const Array<Tp> &pos1, const af_interp_type method,     \
                                        const float offGrid);                                   \
 
-    INSTANTIATE(float  , float )
-    INSTANTIATE(double , double)
-    INSTANTIATE(cfloat , float )
-    INSTANTIATE(cdouble, double)
+INSTANTIATE(float  , float )
+INSTANTIATE(double , double)
+INSTANTIATE(cfloat , float )
+INSTANTIATE(cdouble, double)
+
 }
diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp
index 10856f7..ea38ea7 100644
--- a/src/backend/cpu/bilateral.cpp
+++ b/src/backend/cpu/bilateral.cpp
@@ -99,6 +99,7 @@ void bilateral_(Array<outType> out, const Array<inType> in, float s_sigma, float
 template<typename inType, typename outType, bool isColor>
 Array<outType> bilateral(const Array<inType> &in, const float &s_sigma, const float &c_sigma)
 {
+    in.eval();
     const dim4 dims     = in.dims();
     Array<outType> out = createEmptyArray<outType>(dims);
     getQueue().enqueue(bilateral_<inType, outType, isColor>, out, in, s_sigma, c_sigma);
diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp
index d0bd3c8..ce11867 100644
--- a/src/backend/cpu/cholesky.cpp
+++ b/src/backend/cpu/cholesky.cpp
@@ -47,6 +47,8 @@ CH_FUNC(potrf , cdouble, z)
 template<typename T>
 Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
 {
+    in.eval();
+
     Array<T> out = copyArray<T>(in);
     *info = cholesky_inplace(out, is_upper);
 
@@ -59,6 +61,8 @@ Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
 template<typename T>
 int cholesky_inplace(Array<T> &in, const bool is_upper)
 {
+    in.eval();
+
     dim4 iDims = in.dims();
     int N = iDims[0];
 
diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp
index 5240360..eef5e0e 100644
--- a/src/backend/cpu/copy.cpp
+++ b/src/backend/cpu/copy.cpp
@@ -23,144 +23,144 @@
 
 namespace cpu
 {
-    template<typename T>
-    static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim)
-    {
-        if(dim == 0) {
-            if(strides[dim] == 1) {
-                //FIXME: Check for errors / exceptions
-                memcpy(dst, src, dims[dim] * sizeof(T));
-            } else {
-                for(dim_t i = 0; i < dims[dim]; i++) {
-                    dst[i] = src[strides[dim]*i];
-                }
-            }
+
+template<typename T>
+static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim)
+{
+    if(dim == 0) {
+        if(strides[dim] == 1) {
+            //FIXME: Check for errors / exceptions
+            memcpy(dst, src, dims[dim] * sizeof(T));
         } else {
-            for(dim_t i = dims[dim]; i > 0; i--) {
-                stridedCopy<T>(dst, ostrides, src, dims, strides, dim - 1);
-                src += strides[dim];
-                dst += ostrides[dim];
+            for(dim_t i = 0; i < dims[dim]; i++) {
+                dst[i] = src[strides[dim]*i];
             }
         }
-    }
-
-    // Assigns to single elements
-    template<typename T>
-    void copyData(T *to, const Array<T> &from)
-    {
-        from.eval();
-        getQueue().sync();
-        if(from.isOwner()) {
-            // FIXME: Check for errors / exceptions
-            memcpy(to, from.get(), from.elements()*sizeof(T));
-        } else {
-            dim4 ostrides = calcStrides(from.dims());
-            stridedCopy<T>(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1);
+    } else {
+        for(dim_t i = dims[dim]; i > 0; i--) {
+            stridedCopy<T>(dst, ostrides, src, dims, strides, dim - 1);
+            src += strides[dim];
+            dst += ostrides[dim];
         }
     }
+}
 
-    template<typename T>
-    Array<T> copyArray(const Array<T> &A)
-    {
-        Array<T> out = createEmptyArray<T>(A.dims());
-        copyData(out.get(), A);
-        return out;
+// Assigns to single elements
+template<typename T>
+void copyData(T *to, const Array<T> &from)
+{
+    from.eval();
+    getQueue().sync();
+    if(from.isOwner()) {
+        // FIXME: Check for errors / exceptions
+        memcpy(to, from.get(), from.elements()*sizeof(T));
+    } else {
+        dim4 ostrides = calcStrides(from.dims());
+        stridedCopy<T>(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1);
     }
+}
 
-    template<typename inType, typename outType>
-    static void copy(Array<outType> dst, const Array<inType> src, outType default_value, double factor)
-    {
-        dim4 src_dims       = src.dims();
-        dim4 dst_dims       = dst.dims();
-        dim4 src_strides    = src.strides();
-        dim4 dst_strides    = dst.strides();
+template<typename T>
+Array<T> copyArray(const Array<T> &A)
+{
+    Array<T> out = createEmptyArray<T>(A.dims());
+    copyData(out.get(), A);
+    return out;
+}
 
-        const inType * src_ptr = src.get();
-        outType * dst_ptr      = dst.get();
+template<typename inType, typename outType>
+static void copy(Array<outType> dst, const Array<inType> src, outType default_value, double factor)
+{
+    dim4 src_dims       = src.dims();
+    dim4 dst_dims       = dst.dims();
+    dim4 src_strides    = src.strides();
+    dim4 dst_strides    = dst.strides();
 
-        dim_t trgt_l = std::min(dst_dims[3], src_dims[3]);
-        dim_t trgt_k = std::min(dst_dims[2], src_dims[2]);
-        dim_t trgt_j = std::min(dst_dims[1], src_dims[1]);
-        dim_t trgt_i = std::min(dst_dims[0], src_dims[0]);
+    const inType * src_ptr = src.get();
+    outType * dst_ptr      = dst.get();
 
-        for(dim_t l=0; l<dst_dims[3]; ++l) {
+    dim_t trgt_l = std::min(dst_dims[3], src_dims[3]);
+    dim_t trgt_k = std::min(dst_dims[2], src_dims[2]);
+    dim_t trgt_j = std::min(dst_dims[1], src_dims[1]);
+    dim_t trgt_i = std::min(dst_dims[0], src_dims[0]);
 
-            dim_t src_loff = l*src_strides[3];
-            dim_t dst_loff = l*dst_strides[3];
-            bool isLvalid = l<trgt_l;
+    for(dim_t l=0; l<dst_dims[3]; ++l) {
 
-            for(dim_t k=0; k<dst_dims[2]; ++k) {
+        dim_t src_loff = l*src_strides[3];
+        dim_t dst_loff = l*dst_strides[3];
+        bool isLvalid = l<trgt_l;
 
-                dim_t src_koff = k*src_strides[2];
-                dim_t dst_koff = k*dst_strides[2];
-                bool isKvalid = k<trgt_k;
+        for(dim_t k=0; k<dst_dims[2]; ++k) {
 
-                for(dim_t j=0; j<dst_dims[1]; ++j) {
+            dim_t src_koff = k*src_strides[2];
+            dim_t dst_koff = k*dst_strides[2];
+            bool isKvalid = k<trgt_k;
 
-                    dim_t src_joff = j*src_strides[1];
-                    dim_t dst_joff = j*dst_strides[1];
-                    bool isJvalid = j<trgt_j;
+            for(dim_t j=0; j<dst_dims[1]; ++j) {
 
-                    for(dim_t i=0; i<dst_dims[0]; ++i) {
-                        outType temp = default_value;
-                        if (isLvalid && isKvalid && isJvalid && i<trgt_i) {
-                            dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff;
-                            temp = outType(src_ptr[src_idx])*outType(factor);
-                        }
-                        dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff;
-                        dst_ptr[dst_idx] = temp;
+                dim_t src_joff = j*src_strides[1];
+                dim_t dst_joff = j*dst_strides[1];
+                bool isJvalid = j<trgt_j;
+
+                for(dim_t i=0; i<dst_dims[0]; ++i) {
+                    outType temp = default_value;
+                    if (isLvalid && isKvalid && isJvalid && i<trgt_i) {
+                        dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff;
+                        temp = outType(src_ptr[src_idx])*outType(factor);
                     }
+                    dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff;
+                    dst_ptr[dst_idx] = temp;
                 }
             }
         }
     }
+}
 
-    template<typename T>
-    void multiply_inplace(Array<T> &in, double val)
-    {
-        in.eval();
-        getQueue().enqueue(copy<T, T>, in, in, 0, val);
-    }
-
-    template<typename inType, typename outType>
-    Array<outType> padArray(Array<inType> const &in, dim4 const &dims,
-                            outType default_value, double factor)
-    {
-        Array<outType> ret = createValueArray<outType>(dims, default_value);
-        ret.eval();
-        in.eval();
-        // FIXME:
-        getQueue().sync();
-        getQueue().enqueue(copy<inType, outType>, ret, in, outType(default_value), factor);
-        return ret;
-    }
+template<typename T>
+void multiply_inplace(Array<T> &in, double val)
+{
+    in.eval();
+    getQueue().enqueue(copy<T, T>, in, in, 0, val);
+}
 
-    template<typename inType, typename outType>
-    void copyArray(Array<outType> &out, Array<inType> const &in)
-    {
-        out.eval();
-        in.eval();
-        getQueue().enqueue(copy<inType, outType>, out, in, scalar<outType>(0), 1.0);
-    }
+template<typename inType, typename outType>
+Array<outType> padArray(Array<inType> const &in, dim4 const &dims,
+                        outType default_value, double factor)
+{
+    Array<outType> ret = createValueArray<outType>(dims, default_value);
+    ret.eval();
+    in.eval();
+    // FIXME:
+    getQueue().sync();
+    getQueue().enqueue(copy<inType, outType>, ret, in, outType(default_value), factor);
+    return ret;
+}
 
+template<typename inType, typename outType>
+void copyArray(Array<outType> &out, Array<inType> const &in)
+{
+    out.eval();
+    in.eval();
+    getQueue().enqueue(copy<inType, outType>, out, in, scalar<outType>(0), 1.0);
+}
 
 #define INSTANTIATE(T)                                                  \
     template void      copyData<T> (T *data, const Array<T> &from);     \
     template Array<T>  copyArray<T>(const Array<T> &A);                 \
     template void      multiply_inplace<T> (Array<T> &in, double norm); \
 
-    INSTANTIATE(float  )
-    INSTANTIATE(double )
-    INSTANTIATE(cfloat )
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int    )
-    INSTANTIATE(uint   )
-    INSTANTIATE(uchar  )
-    INSTANTIATE(char   )
-    INSTANTIATE(intl   )
-    INSTANTIATE(uintl  )
-    INSTANTIATE(short  )
-    INSTANTIATE(ushort )
+INSTANTIATE(float  )
+INSTANTIATE(double )
+INSTANTIATE(cfloat )
+INSTANTIATE(cdouble)
+INSTANTIATE(int    )
+INSTANTIATE(uint   )
+INSTANTIATE(uchar  )
+INSTANTIATE(char   )
+INSTANTIATE(intl   )
+INSTANTIATE(uintl  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 
 #define INSTANTIATE_PAD_ARRAY(SRC_T)                                    \
@@ -189,16 +189,16 @@ namespace cpu
     template void copyArray<SRC_T, uchar  >(Array<uchar  > &dst, Array<SRC_T> const &src);  \
     template void copyArray<SRC_T, char   >(Array<char   > &dst, Array<SRC_T> const &src);
 
-    INSTANTIATE_PAD_ARRAY(float )
-    INSTANTIATE_PAD_ARRAY(double)
-    INSTANTIATE_PAD_ARRAY(int   )
-    INSTANTIATE_PAD_ARRAY(uint  )
-    INSTANTIATE_PAD_ARRAY(intl  )
-    INSTANTIATE_PAD_ARRAY(uintl )
-    INSTANTIATE_PAD_ARRAY(uchar )
-    INSTANTIATE_PAD_ARRAY(char  )
-    INSTANTIATE_PAD_ARRAY(ushort)
-    INSTANTIATE_PAD_ARRAY(short )
+INSTANTIATE_PAD_ARRAY(float )
+INSTANTIATE_PAD_ARRAY(double)
+INSTANTIATE_PAD_ARRAY(int   )
+INSTANTIATE_PAD_ARRAY(uint  )
+INSTANTIATE_PAD_ARRAY(intl  )
+INSTANTIATE_PAD_ARRAY(uintl )
+INSTANTIATE_PAD_ARRAY(uchar )
+INSTANTIATE_PAD_ARRAY(char  )
+INSTANTIATE_PAD_ARRAY(ushort)
+INSTANTIATE_PAD_ARRAY(short )
 
 #define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T)                            \
     template Array<cfloat > padArray<SRC_T, cfloat >(Array<SRC_T> const &src, dim4 const &dims, cfloat  default_value, double factor); \
@@ -206,8 +206,8 @@ namespace cpu
     template void copyArray<SRC_T, cfloat  >(Array<cfloat  > &dst, Array<SRC_T> const &src);    \
     template void copyArray<SRC_T, cdouble   >(Array<cdouble > &dst, Array<SRC_T> const &src);
 
-    INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat )
-    INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble)
+INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat )
+INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble)
 
 #define SPECILIAZE_UNUSED_COPYARRAY(SRC_T, DST_T) \
     template<> void copyArray<SRC_T, DST_T>(Array<DST_T> &out, Array<SRC_T> const &in) \
@@ -215,25 +215,25 @@ namespace cpu
         CPU_NOT_SUPPORTED();\
     }
 
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
-    SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, char)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
-    SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, char)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
 
 }
diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp
index 856ed6e..9af7845 100644
--- a/src/backend/cpu/diagonal.cpp
+++ b/src/backend/cpu/diagonal.cpp
@@ -20,87 +20,88 @@
 
 namespace cpu
 {
-    template<typename T>
-    Array<T> diagCreate(const Array<T> &in, const int num)
-    {
-        in.eval();
-
-        int size = in.dims()[0] + std::abs(num);
-        int batch = in.dims()[1];
-        Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
-
-        auto func = [=] (Array<T> out, const Array<T> in) {
-            const T *iptr = in.get();
-                  T *optr = out.get();
-
-            for (int k = 0; k < batch; k++) {
-                for (int j = 0; j < size; j++) {
-                    for (int i = 0; i < size; i++) {
-                        T val = scalar<T>(0);
-                        if (i == j - num) {
-                            val = (num > 0) ? iptr[i] : iptr[j];
-                        }
-                        optr[i + j * out.strides()[1]] = val;
+
+template<typename T>
+Array<T> diagCreate(const Array<T> &in, const int num)
+{
+    in.eval();
+
+    int size = in.dims()[0] + std::abs(num);
+    int batch = in.dims()[1];
+    Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
+
+    auto func = [=] (Array<T> out, const Array<T> in) {
+        const T *iptr = in.get();
+              T *optr = out.get();
+
+        for (int k = 0; k < batch; k++) {
+            for (int j = 0; j < size; j++) {
+                for (int i = 0; i < size; i++) {
+                    T val = scalar<T>(0);
+                    if (i == j - num) {
+                        val = (num > 0) ? iptr[i] : iptr[j];
                     }
+                    optr[i + j * out.strides()[1]] = val;
                 }
-                optr += out.strides()[2];
-                iptr += in.strides()[1];
             }
-        };
-        getQueue().enqueue(func, out, in);
+            optr += out.strides()[2];
+            iptr += in.strides()[1];
+        }
+    };
+    getQueue().enqueue(func, out, in);
 
-        return out;
-    }
+    return out;
+}
 
-    template<typename T>
-    Array<T> diagExtract(const Array<T> &in, const int num)
-    {
-        in.eval();
+template<typename T>
+Array<T> diagExtract(const Array<T> &in, const int num)
+{
+    in.eval();
 
-        const dim4 idims = in.dims();
-        dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
-        Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
+    const dim4 idims = in.dims();
+    dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
+    Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
 
-        auto func = [=] (Array<T> out, const Array<T> in) {
-            const dim4 odims = out.dims();
+    auto func = [=] (Array<T> out, const Array<T> in) {
+        const dim4 odims = out.dims();
 
-            const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
+        const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
 
-            for (int l = 0; l < (int)odims[3]; l++) {
+        for (int l = 0; l < (int)odims[3]; l++) {
 
-                for (int k = 0; k < (int)odims[2]; k++) {
-                    const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
-                    T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];
+            for (int k = 0; k < (int)odims[2]; k++) {
+                const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
+                T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];
 
-                    for (int i = 0; i < (int)odims[0]; i++) {
-                        T val = scalar<T>(0);
-                        if (i < idims[0] && i < idims[1]) val =  iptr[i * in.strides()[1] + i];
-                        optr[i] = val;
-                    }
+                for (int i = 0; i < (int)odims[0]; i++) {
+                    T val = scalar<T>(0);
+                    if (i < idims[0] && i < idims[1]) val =  iptr[i * in.strides()[1] + i];
+                    optr[i] = val;
                 }
             }
-        };
+        }
+    };
 
-        getQueue().enqueue(func, out, in);
+    getQueue().enqueue(func, out, in);
 
-        return out;
-    }
+    return out;
+}
 
 #define INSTANTIATE_DIAGONAL(T)                                          \
     template Array<T>  diagExtract<T>    (const Array<T> &in, const int num); \
     template Array<T>  diagCreate <T>    (const Array<T> &in, const int num);
 
-    INSTANTIATE_DIAGONAL(float)
-    INSTANTIATE_DIAGONAL(double)
-    INSTANTIATE_DIAGONAL(cfloat)
-    INSTANTIATE_DIAGONAL(cdouble)
-    INSTANTIATE_DIAGONAL(int)
-    INSTANTIATE_DIAGONAL(uint)
-    INSTANTIATE_DIAGONAL(intl)
-    INSTANTIATE_DIAGONAL(uintl)
-    INSTANTIATE_DIAGONAL(char)
-    INSTANTIATE_DIAGONAL(uchar)
-    INSTANTIATE_DIAGONAL(short)
-    INSTANTIATE_DIAGONAL(ushort)
+INSTANTIATE_DIAGONAL(float)
+INSTANTIATE_DIAGONAL(double)
+INSTANTIATE_DIAGONAL(cfloat)
+INSTANTIATE_DIAGONAL(cdouble)
+INSTANTIATE_DIAGONAL(int)
+INSTANTIATE_DIAGONAL(uint)
+INSTANTIATE_DIAGONAL(intl)
+INSTANTIATE_DIAGONAL(uintl)
+INSTANTIATE_DIAGONAL(char)
+INSTANTIATE_DIAGONAL(uchar)
+INSTANTIATE_DIAGONAL(short)
+INSTANTIATE_DIAGONAL(ushort)
 
 }
diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp
index 321dae7..8f9c0f1 100644
--- a/src/backend/cpu/diff.cpp
+++ b/src/backend/cpu/diff.cpp
@@ -16,119 +16,122 @@
 
 namespace cpu
 {
-    unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
-    {
-        return (l * strides[3] +
-                k * strides[2] +
-                j * strides[1] +
-                i);
-    }
-
-    template<typename T>
-    Array<T>  diff1(const Array<T> &in, const int dim)
-    {
-        // Bool for dimension
-        bool is_dim0 = dim == 0;
-        bool is_dim1 = dim == 1;
-        bool is_dim2 = dim == 2;
-        bool is_dim3 = dim == 3;
-
-        // Decrement dimension of select dimension
-        af::dim4 dims = in.dims();
-        dims[dim]--;
-
-        // Create output placeholder
-        Array<T> outArray = createEmptyArray<T>(dims);
-
-        auto func = [=] (Array<T> outArray, Array<T> in) {
-            // Get pointers to raw data
-            const T *inPtr = in.get();
-                  T *outPtr = outArray.get();
-
-            // TODO: Improve this
-            for(dim_t l = 0; l < dims[3]; l++) {
-                for(dim_t k = 0; k < dims[2]; k++) {
-                    for(dim_t j = 0; j < dims[1]; j++) {
-                        for(dim_t i = 0; i < dims[0]; i++) {
-                            // Operation: out[index] = in[index + 1 * dim_size] - in[index]
-                            int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
-                            int jdx = getIdx(in.strides(), in.offsets(),
-                                            i + is_dim0, j + is_dim1,
-                                            k + is_dim2, l + is_dim3);
-                            int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
-                            outPtr[odx] = inPtr[jdx] - inPtr[idx];
-                        }
+
+unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
+{
+    return (l * strides[3] +
+            k * strides[2] +
+            j * strides[1] +
+            i);
+}
+
+template<typename T>
+Array<T>  diff1(const Array<T> &in, const int dim)
+{
+    in.eval();
+    // Bool for dimension
+    bool is_dim0 = dim == 0;
+    bool is_dim1 = dim == 1;
+    bool is_dim2 = dim == 2;
+    bool is_dim3 = dim == 3;
+
+    // Decrement dimension of select dimension
+    af::dim4 dims = in.dims();
+    dims[dim]--;
+
+    // Create output placeholder
+    Array<T> outArray = createEmptyArray<T>(dims);
+
+    auto func = [=] (Array<T> outArray, Array<T> in) {
+        // Get pointers to raw data
+        const T *inPtr = in.get();
+              T *outPtr = outArray.get();
+
+        // TODO: Improve this
+        for(dim_t l = 0; l < dims[3]; l++) {
+            for(dim_t k = 0; k < dims[2]; k++) {
+                for(dim_t j = 0; j < dims[1]; j++) {
+                    for(dim_t i = 0; i < dims[0]; i++) {
+                        // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+                        int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
+                        int jdx = getIdx(in.strides(), in.offsets(),
+                                        i + is_dim0, j + is_dim1,
+                                        k + is_dim2, l + is_dim3);
+                        int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
+                        outPtr[odx] = inPtr[jdx] - inPtr[idx];
                     }
                 }
             }
-        };
-        getQueue().enqueue(func, outArray, in);
-
-        return outArray;
-    }
-
-    template<typename T>
-    Array<T>  diff2(const Array<T> &in, const int dim)
-    {
-        // Bool for dimension
-        bool is_dim0 = dim == 0;
-        bool is_dim1 = dim == 1;
-        bool is_dim2 = dim == 2;
-        bool is_dim3 = dim == 3;
-
-        // Decrement dimension of select dimension
-        af::dim4 dims = in.dims();
-        dims[dim] -= 2;
-
-        // Create output placeholder
-        Array<T> outArray = createEmptyArray<T>(dims);
-
-        auto func = [=] (Array<T> outArray, Array<T> in) {
-            // Get pointers to raw data
-            const T *inPtr = in.get();
-                T *outPtr = outArray.get();
-
-            // TODO: Improve this
-            for(dim_t l = 0; l < dims[3]; l++) {
-                for(dim_t k = 0; k < dims[2]; k++) {
-                    for(dim_t j = 0; j < dims[1]; j++) {
-                        for(dim_t i = 0; i < dims[0]; i++) {
-                            // Operation: out[index] = in[index + 1 * dim_size] - in[index]
-                            int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
-                            int jdx = getIdx(in.strides(), in.offsets(),
-                                            i + is_dim0, j + is_dim1,
-                                            k + is_dim2, l + is_dim3);
-                            int kdx = getIdx(in.strides(), in.offsets(),
-                                            i + 2 * is_dim0, j + 2 * is_dim1,
-                                            k + 2 * is_dim2, l + 2 * is_dim3);
-                            int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
-                            outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
-                        }
+        }
+    };
+    getQueue().enqueue(func, outArray, in);
+
+    return outArray;
+}
+
+template<typename T>
+Array<T>  diff2(const Array<T> &in, const int dim)
+{
+    in.eval();
+    // Bool for dimension
+    bool is_dim0 = dim == 0;
+    bool is_dim1 = dim == 1;
+    bool is_dim2 = dim == 2;
+    bool is_dim3 = dim == 3;
+
+    // Decrement dimension of select dimension
+    af::dim4 dims = in.dims();
+    dims[dim] -= 2;
+
+    // Create output placeholder
+    Array<T> outArray = createEmptyArray<T>(dims);
+
+    auto func = [=] (Array<T> outArray, Array<T> in) {
+        // Get pointers to raw data
+        const T *inPtr = in.get();
+            T *outPtr = outArray.get();
+
+        // TODO: Improve this
+        for(dim_t l = 0; l < dims[3]; l++) {
+            for(dim_t k = 0; k < dims[2]; k++) {
+                for(dim_t j = 0; j < dims[1]; j++) {
+                    for(dim_t i = 0; i < dims[0]; i++) {
+                        // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+                        int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
+                        int jdx = getIdx(in.strides(), in.offsets(),
+                                        i + is_dim0, j + is_dim1,
+                                        k + is_dim2, l + is_dim3);
+                        int kdx = getIdx(in.strides(), in.offsets(),
+                                        i + 2 * is_dim0, j + 2 * is_dim1,
+                                        k + 2 * is_dim2, l + 2 * is_dim3);
+                        int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
+                        outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
                     }
                 }
             }
-        };
+        }
+    };
 
-        getQueue().enqueue(func, outArray, in);
+    getQueue().enqueue(func, outArray, in);
 
-        return outArray;
-    }
+    return outArray;
+}
 
 #define INSTANTIATE(T)                                                  \
     template Array<T>  diff1<T>  (const Array<T> &in, const int dim);   \
     template Array<T>  diff2<T>  (const Array<T> &in, const int dim);   \
 
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short)
 }
diff --git a/src/backend/cpu/exampleFunction.cpp b/src/backend/cpu/exampleFunction.cpp
index a9e7bca..d45b8a2 100644
--- a/src/backend/cpu/exampleFunction.cpp
+++ b/src/backend/cpu/exampleFunction.cpp
@@ -24,6 +24,13 @@ namespace cpu
 template<typename T>
 Array<T> exampleFunction(const Array<T> &in, const af_someenum_t method)
 {
+    in.eval();                          // All input Arrays should call eval mandatorily
+                                        // in CPU backend function implementations. Since
+                                        // the cpu fns are asynchronous launches, any Arrays
+                                        // that are either views/JIT nodes needs to evaluated
+                                        // before they are passed onto functions that are
+                                        // enqueued onto the queues.
+
     dim4 outputDims;                    // this should be '= in.dims();' in most cases
                                         // but would definitely depend on the type of
                                         // algorithm you are implementing.
diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp
index 7262e6d..e522954 100644
--- a/src/backend/cpu/fft.cpp
+++ b/src/backend/cpu/fft.cpp
@@ -95,6 +95,7 @@ void fft_inplace_(Array<T> in)
 template<typename T, int rank, bool direction>
 void fft_inplace(Array<T> &in)
 {
+    in.eval();
     getQueue().enqueue(fft_inplace_<T, rank, direction>, in);
 }
 
@@ -165,6 +166,8 @@ void fft_r2c_(Array<Tc> out, const Array<Tr> in)
 template<typename Tc, typename Tr, int rank>
 Array<Tc> fft_r2c(const Array<Tr> &in)
 {
+    in.eval();
+
     dim4 odims = in.dims();
     odims[0] = odims[0] / 2 + 1;
     Array<Tc> out = createEmptyArray<Tc>(odims);
@@ -216,6 +219,8 @@ void fft_c2r_(Array<Tr> out, const Array<Tc> in, const dim4 odims)
 template<typename Tr, typename Tc, int rank>
 Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
 {
+    in.eval();
+
     Array<Tr> out = createEmptyArray<Tr>(odims);
     getQueue().enqueue(fft_c2r_<Tr, Tc, rank>, out, in, odims);
 
@@ -243,4 +248,5 @@ Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
 
     INSTANTIATE_REAL(float , cfloat )
     INSTANTIATE_REAL(double, cdouble)
+
 }
diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp
index 504c02a..06c15cf 100644
--- a/src/backend/cpu/gradient.cpp
+++ b/src/backend/cpu/gradient.cpp
@@ -101,4 +101,5 @@ INSTANTIATE(float)
 INSTANTIATE(double)
 INSTANTIATE(cfloat)
 INSTANTIATE(cdouble)
+
 }
diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp
index f7236bd..55c4417 100644
--- a/src/backend/cpu/identity.cpp
+++ b/src/backend/cpu/identity.cpp
@@ -18,6 +18,7 @@
 
 namespace cpu
 {
+
 template<typename T>
 Array<T> identity(const dim4& dims)
 {
@@ -56,4 +57,5 @@ INSTANTIATE_IDENTITY(char)
 INSTANTIATE_IDENTITY(uchar)
 INSTANTIATE_IDENTITY(short)
 INSTANTIATE_IDENTITY(ushort)
+
 }
diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp
index 170b6a1..dcb85fa 100644
--- a/src/backend/cpu/iota.cpp
+++ b/src/backend/cpu/iota.cpp
@@ -21,6 +21,7 @@ using namespace std;
 
 namespace cpu
 {
+
 ///////////////////////////////////////////////////////////////////////////
 // Kernel Functions
 ///////////////////////////////////////////////////////////////////////////
@@ -76,4 +77,5 @@ INSTANTIATE(uintl)
 INSTANTIATE(uchar)
 INSTANTIATE(short)
 INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp
index e562bae..9858cba 100644
--- a/src/backend/cpu/ireduce.cpp
+++ b/src/backend/cpu/ireduce.cpp
@@ -21,178 +21,180 @@ using af::dim4;
 
 namespace cpu
 {
-    template<typename T> double cabs(const T in) { return (double)in; }
-    static double cabs(const char in) { return (double)(in > 0); }
-    static double cabs(const cfloat &in) { return (double)abs(in); }
-    static double cabs(const cdouble &in) { return (double)abs(in); }
 
-    template<af_op_t op, typename T>
-    struct MinMaxOp
+template<typename T> double cabs(const T in) { return (double)in; }
+static double cabs(const char in) { return (double)(in > 0); }
+static double cabs(const cfloat &in) { return (double)abs(in); }
+static double cabs(const cdouble &in) { return (double)abs(in); }
+
+template<af_op_t op, typename T>
+struct MinMaxOp
+{
+    T m_val;
+    uint m_idx;
+    MinMaxOp(T val, uint idx) :
+        m_val(val), m_idx(idx)
     {
-        T m_val;
-        uint m_idx;
-        MinMaxOp(T val, uint idx) :
-            m_val(val), m_idx(idx)
-        {
-        }
+    }
 
-        void operator()(T val, uint idx)
-        {
-            if (cabs(val) < cabs(m_val) ||
-                (cabs(val) == cabs(m_val) &&
-                 idx > m_idx)) {
-                m_val = val;
-                m_idx = idx;
-            }
+    void operator()(T val, uint idx)
+    {
+        if (cabs(val) < cabs(m_val) ||
+            (cabs(val) == cabs(m_val) &&
+             idx > m_idx)) {
+            m_val = val;
+            m_idx = idx;
         }
-    };
+    }
+};
 
-    template<typename T>
-    struct MinMaxOp<af_max_t, T>
+template<typename T>
+struct MinMaxOp<af_max_t, T>
+{
+    T m_val;
+    uint m_idx;
+    MinMaxOp(T val, uint idx) :
+        m_val(val), m_idx(idx)
     {
-        T m_val;
-        uint m_idx;
-        MinMaxOp(T val, uint idx) :
-            m_val(val), m_idx(idx)
-        {
-        }
+    }
 
-        void operator()(T val, uint idx)
-        {
-            if (cabs(val) > cabs(m_val) ||
-                (cabs(val) == cabs(m_val) &&
-                 idx <= m_idx)) {
-                m_val = val;
-                m_idx = idx;
-            }
+    void operator()(T val, uint idx)
+    {
+        if (cabs(val) > cabs(m_val) ||
+            (cabs(val) == cabs(m_val) &&
+             idx <= m_idx)) {
+            m_val = val;
+            m_idx = idx;
         }
-    };
+    }
+};
 
-    template<af_op_t op, typename T, int D>
-    struct ireduce_dim
+template<af_op_t op, typename T, int D>
+struct ireduce_dim
+{
+    void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+                    const Array<T> input, const dim_t inOffset, const int dim)
     {
-        void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
-                        const Array<T> input, const dim_t inOffset, const int dim)
-        {
-            const dim4 odims    = output.dims();
-            const dim4 ostrides = output.strides();
-            const dim4 istrides = input.strides();
-            const int D1 = D - 1;
-            for (dim_t i = 0; i < odims[D1]; i++) {
-                ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
-                                         input, inOffset + i * istrides[D1], dim);
-            }
+        const dim4 odims    = output.dims();
+        const dim4 ostrides = output.strides();
+        const dim4 istrides = input.strides();
+        const int D1 = D - 1;
+        for (dim_t i = 0; i < odims[D1]; i++) {
+            ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
+                                     input, inOffset + i * istrides[D1], dim);
         }
-    };
+    }
+};
 
-    template<af_op_t op, typename T>
-    struct ireduce_dim<op, T, 0>
+template<af_op_t op, typename T>
+struct ireduce_dim<op, T, 0>
+{
+    void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+                    const Array<T> input, const dim_t inOffset, const int dim)
     {
-        void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
-                        const Array<T> input, const dim_t inOffset, const int dim)
-        {
-            const dim4 idims = input.dims();
-            const dim4 istrides = input.strides();
-
-            T const * const in = input.get();
-            T * out = output.get();
-            uint * loc = locArray.get();
-
-            dim_t stride = istrides[dim];
-            MinMaxOp<op, T> Op(in[0], 0);
-            for (dim_t i = 0; i < idims[dim]; i++) {
-                Op(in[inOffset + i * stride], i);
-            }
+        const dim4 idims = input.dims();
+        const dim4 istrides = input.strides();
 
-            *(out+outOffset) = Op.m_val;
-            *(loc+outOffset) = Op.m_idx;
-        }
-    };
+        T const * const in = input.get();
+        T * out = output.get();
+        uint * loc = locArray.get();
 
-    template<af_op_t op, typename T>
-    using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
-                                                const Array<T>, const dim_t, const int)>;
+        dim_t stride = istrides[dim];
+        MinMaxOp<op, T> Op(in[0], 0);
+        for (dim_t i = 0; i < idims[dim]; i++) {
+            Op(in[inOffset + i * stride], i);
+        }
 
-    template<af_op_t op, typename T>
-    void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
-    {
-        out.eval();
-        loc.eval();
-        in.eval();
-
-        dim4 odims = in.dims();
-        odims[dim] = 1;
-        static const ireduce_dim_func<op, T> ireduce_funcs[] = { ireduce_dim<op, T, 1>()
-                                                               , ireduce_dim<op, T, 2>()
-                                                               , ireduce_dim<op, T, 3>()
-                                                               , ireduce_dim<op, T, 4>()};
-
-        getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim);
+        *(out+outOffset) = Op.m_val;
+        *(loc+outOffset) = Op.m_idx;
     }
+};
 
-    template<af_op_t op, typename T>
-    T ireduce_all(unsigned *loc, const Array<T> &in)
-    {
-        in.eval();
-        getQueue().sync();
+template<af_op_t op, typename T>
+using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
+                                            const Array<T>, const dim_t, const int)>;
 
-        af::dim4 dims = in.dims();
-        af::dim4 strides = in.strides();
-        const T *inPtr = in.get();
+template<af_op_t op, typename T>
+void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
+{
+    out.eval();
+    loc.eval();
+    in.eval();
+
+    dim4 odims = in.dims();
+    odims[dim] = 1;
+    static const ireduce_dim_func<op, T> ireduce_funcs[] = { ireduce_dim<op, T, 1>()
+                                                           , ireduce_dim<op, T, 2>()
+                                                           , ireduce_dim<op, T, 3>()
+                                                           , ireduce_dim<op, T, 4>()};
+
+    getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim);
+}
 
-        MinMaxOp<op, T> Op(inPtr[0], 0);
+template<af_op_t op, typename T>
+T ireduce_all(unsigned *loc, const Array<T> &in)
+{
+    in.eval();
+    getQueue().sync();
+
+    af::dim4 dims = in.dims();
+    af::dim4 strides = in.strides();
+    const T *inPtr = in.get();
 
-        for(dim_t l = 0; l < dims[3]; l++) {
-            dim_t off3 = l * strides[3];
+    MinMaxOp<op, T> Op(inPtr[0], 0);
 
-            for(dim_t k = 0; k < dims[2]; k++) {
-                dim_t off2 = k * strides[2];
+    for(dim_t l = 0; l < dims[3]; l++) {
+        dim_t off3 = l * strides[3];
 
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    dim_t off1 = j * strides[1];
+        for(dim_t k = 0; k < dims[2]; k++) {
+            dim_t off2 = k * strides[2];
 
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        dim_t idx = i + off1 + off2 + off3;
-                        Op(inPtr[idx], idx);
-                    }
+            for(dim_t j = 0; j < dims[1]; j++) {
+                dim_t off1 = j * strides[1];
+
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    dim_t idx = i + off1 + off2 + off3;
+                    Op(inPtr[idx], idx);
                 }
             }
         }
-
-        *loc = Op.m_idx;
-        return Op.m_val;
     }
 
+    *loc = Op.m_idx;
+    return Op.m_val;
+}
+
 #define INSTANTIATE(ROp, T)                                             \
     template void ireduce<ROp, T>(Array<T> &out, Array<uint> &loc,      \
                                   const Array<T> &in, const int dim);   \
     template T ireduce_all<ROp, T>(unsigned *loc, const Array<T> &in);  \
 
-    //min
-    INSTANTIATE(af_min_t, float  )
-    INSTANTIATE(af_min_t, double )
-    INSTANTIATE(af_min_t, cfloat )
-    INSTANTIATE(af_min_t, cdouble)
-    INSTANTIATE(af_min_t, int    )
-    INSTANTIATE(af_min_t, uint   )
-    INSTANTIATE(af_min_t, intl   )
-    INSTANTIATE(af_min_t, uintl  )
-    INSTANTIATE(af_min_t, char   )
-    INSTANTIATE(af_min_t, uchar  )
-    INSTANTIATE(af_min_t, short  )
-    INSTANTIATE(af_min_t, ushort )
-
-    //max
-    INSTANTIATE(af_max_t, float  )
-    INSTANTIATE(af_max_t, double )
-    INSTANTIATE(af_max_t, cfloat )
-    INSTANTIATE(af_max_t, cdouble)
-    INSTANTIATE(af_max_t, int    )
-    INSTANTIATE(af_max_t, uint   )
-    INSTANTIATE(af_max_t, intl   )
-    INSTANTIATE(af_max_t, uintl  )
-    INSTANTIATE(af_max_t, char   )
-    INSTANTIATE(af_max_t, uchar  )
-    INSTANTIATE(af_max_t, short  )
-    INSTANTIATE(af_max_t, ushort )
+//min
+INSTANTIATE(af_min_t, float  )
+INSTANTIATE(af_min_t, double )
+INSTANTIATE(af_min_t, cfloat )
+INSTANTIATE(af_min_t, cdouble)
+INSTANTIATE(af_min_t, int    )
+INSTANTIATE(af_min_t, uint   )
+INSTANTIATE(af_min_t, intl   )
+INSTANTIATE(af_min_t, uintl  )
+INSTANTIATE(af_min_t, char   )
+INSTANTIATE(af_min_t, uchar  )
+INSTANTIATE(af_min_t, short  )
+INSTANTIATE(af_min_t, ushort )
+
+//max
+INSTANTIATE(af_max_t, float  )
+INSTANTIATE(af_max_t, double )
+INSTANTIATE(af_max_t, cfloat )
+INSTANTIATE(af_max_t, cdouble)
+INSTANTIATE(af_max_t, int    )
+INSTANTIATE(af_max_t, uint   )
+INSTANTIATE(af_max_t, intl   )
+INSTANTIATE(af_max_t, uintl  )
+INSTANTIATE(af_max_t, char   )
+INSTANTIATE(af_max_t, uchar  )
+INSTANTIATE(af_max_t, short  )
+INSTANTIATE(af_max_t, ushort )
+
 }
diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp
index ff0be43..9a04613 100644
--- a/src/backend/cpu/lu.cpp
+++ b/src/backend/cpu/lu.cpp
@@ -104,6 +104,8 @@ void convertPivot(Array<int> p, Array<int> pivot)
 template<typename T>
 void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
 {
+    in.eval();
+
     dim4 iDims = in.dims();
     int M = iDims[0];
     int N = iDims[1];
@@ -123,6 +125,8 @@ void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
 template<typename T>
 Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
 {
+    in.eval();
+
     dim4 iDims = in.dims();
     Array<int> pivot = createEmptyArray<int>(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1));
 
@@ -166,6 +170,7 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
 
 namespace cpu
 {
+
 #define INSTANTIATE_LU(T)                                                                           \
     template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot);                      \
     template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
@@ -174,4 +179,5 @@ INSTANTIATE_LU(float)
 INSTANTIATE_LU(cfloat)
 INSTANTIATE_LU(double)
 INSTANTIATE_LU(cdouble)
+
 }
diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp
index 02a4888..d4ce95a 100644
--- a/src/backend/cpu/match_template.cpp
+++ b/src/backend/cpu/match_template.cpp
@@ -24,6 +24,9 @@ namespace cpu
 template<typename inType, typename outType, af_match_type mType>
 Array<outType> match_template(const Array<inType> &sImg, const Array<inType> &tImg)
 {
+    sImg.eval();
+    tImg.eval();
+
     Array<outType> out = createEmptyArray<outType>(sImg.dims());
 
     auto func = [=](Array<outType> out, const Array<inType> sImg, const Array<inType> tImg) {
diff --git a/src/backend/cpu/math.cpp b/src/backend/cpu/math.cpp
index 5a6bcbc..e00fd78 100644
--- a/src/backend/cpu/math.cpp
+++ b/src/backend/cpu/math.cpp
@@ -11,39 +11,41 @@
 
 namespace cpu
 {
-    uint abs(uint val) { return val; }
-    uchar abs(uchar val) { return val; }
-    uintl abs(uintl val) { return val; }
-
-    cfloat  scalar(float val)
-    {
-        cfloat  cval = {(float)val, 0};
-        return cval;
-    }
-
-    cdouble scalar(double val)
-    {
-        cdouble  cval = {val, 0};
-        return cval;
-    }
-
-    cfloat min(cfloat lhs, cfloat rhs)
-    {
-        return abs(lhs) < abs(rhs) ? lhs : rhs;
-    }
-
-    cdouble min(cdouble lhs, cdouble rhs)
-    {
-        return abs(lhs) < abs(rhs) ? lhs : rhs;
-    }
-
-    cfloat max(cfloat lhs, cfloat rhs)
-    {
-        return abs(lhs) > abs(rhs) ? lhs : rhs;
-    }
-
-    cdouble max(cdouble lhs, cdouble rhs)
-    {
-        return abs(lhs) > abs(rhs) ? lhs : rhs;
-    }
+
+uint abs(uint val) { return val; }
+uchar abs(uchar val) { return val; }
+uintl abs(uintl val) { return val; }
+
+cfloat  scalar(float val)
+{
+    cfloat  cval = {(float)val, 0};
+    return cval;
+}
+
+cdouble scalar(double val)
+{
+    cdouble  cval = {val, 0};
+    return cval;
+}
+
+cfloat min(cfloat lhs, cfloat rhs)
+{
+    return abs(lhs) < abs(rhs) ? lhs : rhs;
+}
+
+cdouble min(cdouble lhs, cdouble rhs)
+{
+    return abs(lhs) < abs(rhs) ? lhs : rhs;
+}
+
+cfloat max(cfloat lhs, cfloat rhs)
+{
+    return abs(lhs) > abs(rhs) ? lhs : rhs;
+}
+
+cdouble max(cdouble lhs, cdouble rhs)
+{
+    return abs(lhs) > abs(rhs) ? lhs : rhs;
+}
+
 }
diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp
index 3f99d15..62b80e0 100644
--- a/src/backend/cpu/meanshift.cpp
+++ b/src/backend/cpu/meanshift.cpp
@@ -33,6 +33,8 @@ inline dim_t clamp(dim_t a, dim_t mn, dim_t mx)
 template<typename T, bool is_color>
 Array<T>  meanshift(const Array<T> &in, const float &s_sigma, const float &c_sigma, const unsigned iter)
 {
+    in.eval();
+
     Array<T> out = createEmptyArray<T>(in.dims());
 
     auto func = [=] (Array<T> out, const Array<T> in, const float s_sigma,
diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp
index ce921fc..4e74a55 100644
--- a/src/backend/cpu/medfilt.cpp
+++ b/src/backend/cpu/medfilt.cpp
@@ -25,6 +25,8 @@ namespace cpu
 template<typename T, af_border_type pad>
 Array<T> medfilt(const Array<T> &in, dim_t w_len, dim_t w_wid)
 {
+    in.eval();
+
     Array<T> out        = createEmptyArray<T>(in.dims());
 
     auto func = [=] (Array<T> out, const Array<T> in,
diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp
index 73120b9..e11f994 100644
--- a/src/backend/cpu/memory.cpp
+++ b/src/backend/cpu/memory.cpp
@@ -20,211 +20,211 @@
 namespace cpu
 {
 
-    static size_t memory_resolution = 1024; //1KB
+static size_t memory_resolution = 1024; //1KB
 
-    void setMemStepSize(size_t step_bytes)
-    {
-        memory_resolution = step_bytes;
-    }
+void setMemStepSize(size_t step_bytes)
+{
+    memory_resolution = step_bytes;
+}
+
+size_t getMemStepSize(void)
+{
+    return memory_resolution;
+}
 
-    size_t getMemStepSize(void)
+class Manager
+{
+    public:
+    static bool initialized;
+    Manager()
     {
-        return memory_resolution;
+        initialized = true;
     }
 
-    class Manager
+    ~Manager()
     {
-        public:
-        static bool initialized;
-        Manager()
-        {
-            initialized = true;
-        }
-
-        ~Manager()
-        {
-            garbageCollect();
-        }
-    };
+        garbageCollect();
+    }
+};
 
-    bool Manager::initialized = false;
+bool Manager::initialized = false;
 
-    static void managerInit()
-    {
-        if(Manager::initialized == false)
-            static Manager pm = Manager();
-    }
+static void managerInit()
+{
+    if(Manager::initialized == false)
+        static Manager pm = Manager();
+}
 
-    typedef struct
-    {
-        bool is_free;
-        bool is_unlinked;
-        size_t bytes;
-    } mem_info;
-
-    static size_t used_bytes = 0;
-    static size_t used_buffers = 0;
-    static size_t total_bytes = 0;
-    typedef std::map<void *, mem_info> mem_t;
-    typedef mem_t::iterator mem_iter;
-
-    mem_t memory_map;
-    std::mutex memory_map_mutex;
-
-    template<typename T>
-    void freeWrapper(T *ptr)
-    {
-        free((void *)ptr);
-    }
+typedef struct
+{
+    bool is_free;
+    bool is_unlinked;
+    size_t bytes;
+} mem_info;
+
+static size_t used_bytes = 0;
+static size_t used_buffers = 0;
+static size_t total_bytes = 0;
+typedef std::map<void *, mem_info> mem_t;
+typedef mem_t::iterator mem_iter;
+
+mem_t memory_map;
+std::mutex memory_map_mutex;
+
+template<typename T>
+void freeWrapper(T *ptr)
+{
+    free((void *)ptr);
+}
 
-    void garbageCollect()
-    {
-        for(mem_iter iter = memory_map.begin();
-            iter != memory_map.end(); ++iter) {
+void garbageCollect()
+{
+    for(mem_iter iter = memory_map.begin();
+        iter != memory_map.end(); ++iter) {
 
-            if ((iter->second).is_free) {
+        if ((iter->second).is_free) {
 
-                if (!(iter->second).is_unlinked) {
-                    freeWrapper(iter->first);
-                    total_bytes -= iter->second.bytes;
-                }
+            if (!(iter->second).is_unlinked) {
+                freeWrapper(iter->first);
+                total_bytes -= iter->second.bytes;
             }
         }
+    }
 
-        mem_iter memory_curr = memory_map.begin();
-        mem_iter memory_end  = memory_map.end();
+    mem_iter memory_curr = memory_map.begin();
+    mem_iter memory_end  = memory_map.end();
 
-        while(memory_curr != memory_end) {
-            if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) {
-                memory_map.erase(memory_curr++);
-            } else {
-                ++memory_curr;
-            }
+    while(memory_curr != memory_end) {
+        if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) {
+            memory_map.erase(memory_curr++);
+        } else {
+            ++memory_curr;
         }
     }
+}
 
-    template<typename T>
-    T* memAlloc(const size_t &elements)
-    {
-        managerInit();
+template<typename T>
+T* memAlloc(const size_t &elements)
+{
+    managerInit();
 
-        T* ptr = NULL;
-        size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;
+    T* ptr = NULL;
+    size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;
 
-        if (elements > 0) {
-            std::lock_guard<std::mutex> lock(memory_map_mutex);
+    if (elements > 0) {
+        std::lock_guard<std::mutex> lock(memory_map_mutex);
 
-            // FIXME: Add better checks for garbage collection
-            // Perhaps look at total memory available as a metric
-            if (memory_map.size() > MAX_BUFFERS ||
-                used_bytes >= MAX_BYTES) {
+        // FIXME: Add better checks for garbage collection
+        // Perhaps look at total memory available as a metric
+        if (memory_map.size() > MAX_BUFFERS ||
+            used_bytes >= MAX_BYTES) {
 
-                garbageCollect();
-            }
+            garbageCollect();
+        }
 
-            for(mem_iter iter = memory_map.begin();
-                iter != memory_map.end(); ++iter) {
+        for(mem_iter iter = memory_map.begin();
+            iter != memory_map.end(); ++iter) {
 
-                mem_info info = iter->second;
+            mem_info info = iter->second;
 
-                if ( info.is_free &&
-                    !info.is_unlinked &&
-                     info.bytes == alloc_bytes) {
+            if ( info.is_free &&
+                !info.is_unlinked &&
+                 info.bytes == alloc_bytes) {
 
-                    iter->second.is_free = false;
-                    used_bytes += alloc_bytes;
-                    used_buffers++;
-                    return (T *)iter->first;
-                }
+                iter->second.is_free = false;
+                used_bytes += alloc_bytes;
+                used_buffers++;
+                return (T *)iter->first;
             }
+        }
 
-            // Perform garbage collection if memory can not be allocated
-            ptr = (T *)malloc(alloc_bytes);
+        // Perform garbage collection if memory can not be allocated
+        ptr = (T *)malloc(alloc_bytes);
 
-            if (ptr == NULL) {
-                AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM);
-            }
+        if (ptr == NULL) {
+            AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM);
+        }
 
-            mem_info info = {false, false, alloc_bytes};
-            memory_map[ptr] = info;
+        mem_info info = {false, false, alloc_bytes};
+        memory_map[ptr] = info;
 
-            used_bytes += alloc_bytes;
-            used_buffers++;
-            total_bytes += alloc_bytes;
-        }
-        return ptr;
+        used_bytes += alloc_bytes;
+        used_buffers++;
+        total_bytes += alloc_bytes;
     }
+    return ptr;
+}
 
-    template<typename T>
-    void memFree(T *ptr)
-    {
-        std::lock_guard<std::mutex> lock(memory_map_mutex);
+template<typename T>
+void memFree(T *ptr)
+{
+    std::lock_guard<std::mutex> lock(memory_map_mutex);
 
-        mem_iter iter = memory_map.find((void *)ptr);
+    mem_iter iter = memory_map.find((void *)ptr);
 
-        if (iter != memory_map.end()) {
+    if (iter != memory_map.end()) {
 
-            iter->second.is_free = true;
-            if ((iter->second).is_unlinked) return;
+        iter->second.is_free = true;
+        if ((iter->second).is_unlinked) return;
 
-            used_bytes -= iter->second.bytes;
-            used_buffers--;
+        used_bytes -= iter->second.bytes;
+        used_buffers--;
 
-        } else {
-            freeWrapper(ptr); // Free it because we are not sure what the size is
-        }
+    } else {
+        freeWrapper(ptr); // Free it because we are not sure what the size is
     }
+}
 
-    template<typename T>
-    void memPop(const T *ptr)
-    {
-        std::lock_guard<std::mutex> lock(memory_map_mutex);
+template<typename T>
+void memPop(const T *ptr)
+{
+    std::lock_guard<std::mutex> lock(memory_map_mutex);
 
-        mem_iter iter = memory_map.find((void *)ptr);
+    mem_iter iter = memory_map.find((void *)ptr);
 
-        if (iter != memory_map.end()) {
-            iter->second.is_unlinked = true;
-        } else {
-            mem_info info = { false,
-                              true,
-                              100 }; //This number is not relevant
+    if (iter != memory_map.end()) {
+        iter->second.is_unlinked = true;
+    } else {
+        mem_info info = { false,
+                          true,
+                          100 }; //This number is not relevant
 
-            memory_map[(void *)ptr] = info;
-        }
+        memory_map[(void *)ptr] = info;
     }
+}
 
-    template<typename T>
-    void memPush(const T *ptr)
-    {
-        std::lock_guard<std::mutex> lock(memory_map_mutex);
-        mem_iter iter = memory_map.find((void *)ptr);
-        if (iter != memory_map.end()) {
-            iter->second.is_unlinked = false;
-        }
+template<typename T>
+void memPush(const T *ptr)
+{
+    std::lock_guard<std::mutex> lock(memory_map_mutex);
+    mem_iter iter = memory_map.find((void *)ptr);
+    if (iter != memory_map.end()) {
+        iter->second.is_unlinked = false;
     }
+}
 
 
-    void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
-                          size_t *lock_bytes,  size_t *lock_buffers)
-    {
-        getQueue().sync();
-        if (alloc_bytes   ) *alloc_bytes   = total_bytes;
-        if (alloc_buffers ) *alloc_buffers = memory_map.size();
-        if (lock_bytes    ) *lock_bytes    = used_bytes;
-        if (lock_buffers  ) *lock_buffers  = used_buffers;
-    }
+void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+                      size_t *lock_bytes,  size_t *lock_buffers)
+{
+    getQueue().sync();
+    if (alloc_bytes   ) *alloc_bytes   = total_bytes;
+    if (alloc_buffers ) *alloc_buffers = memory_map.size();
+    if (lock_bytes    ) *lock_bytes    = used_bytes;
+    if (lock_buffers  ) *lock_buffers  = used_buffers;
+}
 
-    template<typename T>
-    T* pinnedAlloc(const size_t &elements)
-    {
-        return memAlloc<T>(elements);
-    }
+template<typename T>
+T* pinnedAlloc(const size_t &elements)
+{
+    return memAlloc<T>(elements);
+}
 
-    template<typename T>
-    void pinnedFree(T* ptr)
-    {
-        memFree<T>(ptr);
-    }
+template<typename T>
+void pinnedFree(T* ptr)
+{
+    memFree<T>(ptr);
+}
 
 #define INSTANTIATE(T)                                  \
     template T* memAlloc(const size_t &elements);       \
@@ -234,16 +234,17 @@ namespace cpu
     template T* pinnedAlloc(const size_t &elements);    \
     template void pinnedFree(T* ptr);                   \
 
-    INSTANTIATE(float)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(double)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(ushort)
-    INSTANTIATE(short )
+INSTANTIATE(float)
+INSTANTIATE(cfloat)
+INSTANTIATE(double)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(ushort)
+INSTANTIATE(short )
+
 }
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
index 97f0e0a..b6f50c2 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -163,8 +163,6 @@ void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
 
     idx  = createEmptyArray<uint>(outDims);
     dist = createEmptyArray<To  >(outDims);
-    idx.eval();
-    dist.eval();
 
     switch(dist_type) {
         case AF_SAD:
diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp
index b5f1806..78631fc 100644
--- a/src/backend/cpu/qr.cpp
+++ b/src/backend/cpu/qr.cpp
@@ -59,6 +59,8 @@ GQR_FUNC(gqr , cdouble, zungqr)
 template<typename T>
 void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
 {
+    in.eval();
+
     dim4 iDims = in.dims();
     int M      = iDims[0];
     int N      = iDims[1];
@@ -83,6 +85,8 @@ void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
 template<typename T>
 Array<T> qr_inplace(Array<T> &in)
 {
+    in.eval();
+
     dim4 iDims = in.dims();
     int M      = iDims[0];
     int N      = iDims[1];
@@ -121,6 +125,7 @@ Array<T> qr_inplace(Array<T> &in)
 
 namespace cpu
 {
+
 #define INSTANTIATE_QR(T)                                                                           \
     template Array<T> qr_inplace<T>(Array<T> &in);                                                \
     template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
@@ -129,4 +134,5 @@ INSTANTIATE_QR(float)
 INSTANTIATE_QR(cfloat)
 INSTANTIATE_QR(double)
 INSTANTIATE_QR(cdouble)
+
 }
diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp
index 1fa46b2..7837db5 100644
--- a/src/backend/cpu/range.cpp
+++ b/src/backend/cpu/range.cpp
@@ -19,6 +19,7 @@
 
 namespace cpu
 {
+
 ///////////////////////////////////////////////////////////////////////////
 // Kernel Functions
 ///////////////////////////////////////////////////////////////////////////
@@ -90,4 +91,5 @@ INSTANTIATE(uintl)
 INSTANTIATE(uchar)
 INSTANTIATE(ushort)
 INSTANTIATE(short)
+
 }
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index afe5620..1ad7dad 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -16,70 +16,70 @@
 
 namespace cpu
 {
-    template<typename T>
-    void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
-    {
-        T* outPtr = out.get();
-        const T* inPtr = in.get();
 
-        const af::dim4 ist = in.strides();
-        const af::dim4 ost = out.strides();
+template<typename T>
+void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
+{
+    T* outPtr = out.get();
+    const T* inPtr = in.get();
+
+    const af::dim4 ist = in.strides();
+    const af::dim4 ost = out.strides();
 
 
-        dim_t ids[4]  = {0};
-        for(dim_t ow = 0; ow < oDims[3]; ow++) {
-            const dim_t oW = ow * ost[3];
-            ids[rdims[3]] = ow;
-            for(dim_t oz = 0; oz < oDims[2]; oz++) {
-                const dim_t oZW = oW + oz * ost[2];
-                ids[rdims[2]] = oz;
-                for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                    const dim_t oYZW = oZW + oy * ost[1];
-                    ids[rdims[1]] = oy;
-                    for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                        const dim_t oIdx = oYZW + ox;
+    dim_t ids[4]  = {0};
+    for(dim_t ow = 0; ow < oDims[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        ids[rdims[3]] = ow;
+        for(dim_t oz = 0; oz < oDims[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            ids[rdims[2]] = oz;
+            for(dim_t oy = 0; oy < oDims[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                ids[rdims[1]] = oy;
+                for(dim_t ox = 0; ox < oDims[0]; ox++) {
+                    const dim_t oIdx = oYZW + ox;
 
-                        ids[rdims[0]] = ox;
-                        const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
-                                              ids[1] * ist[1] + ids[0];
+                    ids[rdims[0]] = ox;
+                    const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
+                                          ids[1] * ist[1] + ids[0];
 
-                        outPtr[oIdx] = inPtr[iIdx];
-                    }
+                    outPtr[oIdx] = inPtr[iIdx];
                 }
             }
         }
     }
+}
 
-    template<typename T>
-    Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
-    {
-        in.eval();
+template<typename T>
+Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
+{
+    in.eval();
 
-        const af::dim4 iDims = in.dims();
-        af::dim4 oDims(0);
-        for(int i = 0; i < 4; i++)
-            oDims[i] = iDims[rdims[i]];
+    const af::dim4 iDims = in.dims();
+    af::dim4 oDims(0);
+    for(int i = 0; i < 4; i++)
+        oDims[i] = iDims[rdims[i]];
 
-        Array<T> out = createEmptyArray<T>(oDims);
-        getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
-        return out;
-    }
+    Array<T> out = createEmptyArray<T>(oDims);
+    getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
+    return out;
+}
 
 #define INSTANTIATE(T)                                                         \
     template Array<T> reorder<T>(const Array<T> &in, const af::dim4 &rdims);  \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp
index 160ed46..8fb2edc 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/resize.cpp
@@ -19,6 +19,7 @@
 
 namespace cpu
 {
+
 /**
  * noop function for round to avoid compilation
  * issues due to lack of this function in C90 based
@@ -215,4 +216,5 @@ INSTANTIATE(uchar)
 INSTANTIATE(char)
 INSTANTIATE(short)
 INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp
index 01ec962..5687d69 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/rotate.cpp
@@ -18,106 +18,110 @@
 
 namespace cpu
 {
-    template<typename T, af_interp_type method>
-    void rotate_(Array<T> output, const Array<T> input, const float theta)
+
+template<typename T, af_interp_type method>
+void rotate_(Array<T> output, const Array<T> input, const float theta)
+{
+    const af::dim4 odims    = output.dims();
+    const af::dim4 idims    = input.dims();
+    const af::dim4 ostrides = output.strides();
+    const af::dim4 istrides = input.strides();
+
+    const T* in   = input.get();
+          T* out  = output.get();
+    dim_t nimages = idims[2];
+
+    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
+                 const af::dim4 &, const af::dim4 &,
+                 const dim_t, const dim_t, const dim_t, const dim_t);
+
+    const float c = cos(-theta), s = sin(-theta);
+    float tx, ty;
     {
-        const af::dim4 odims    = output.dims();
-        const af::dim4 idims    = input.dims();
-        const af::dim4 ostrides = output.strides();
-        const af::dim4 istrides = input.strides();
-
-        const T* in   = input.get();
-              T* out  = output.get();
-        dim_t nimages = idims[2];
-
-        void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
-                     const af::dim4 &, const af::dim4 &,
-                     const dim_t, const dim_t, const dim_t, const dim_t);
-
-        const float c = cos(-theta), s = sin(-theta);
-        float tx, ty;
-        {
-            const float nx = 0.5 * (idims[0] - 1);
-            const float ny = 0.5 * (idims[1] - 1);
-            const float mx = 0.5 * (odims[0] - 1);
-            const float my = 0.5 * (odims[1] - 1);
-            const float sx = (mx * c + my *-s);
-            const float sy = (mx * s + my * c);
-            tx = -(sx - nx);
-            ty = -(sy - ny);
-        }
+        const float nx = 0.5 * (idims[0] - 1);
+        const float ny = 0.5 * (idims[1] - 1);
+        const float mx = 0.5 * (odims[0] - 1);
+        const float my = 0.5 * (odims[1] - 1);
+        const float sx = (mx * c + my *-s);
+        const float sy = (mx * s + my * c);
+        tx = -(sx - nx);
+        ty = -(sy - ny);
+    }
 
-        const float tmat[6] = {std::round( c * 1000) / 1000.0f,
-                               std::round(-s * 1000) / 1000.0f,
-                               std::round(tx * 1000) / 1000.0f,
-                               std::round( s * 1000) / 1000.0f,
-                               std::round( c * 1000) / 1000.0f,
-                               std::round(ty * 1000) / 1000.0f,
-                              };
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                t_fn = &transform_n;
-                break;
-            case AF_INTERP_BILINEAR:
-                t_fn = &transform_b;
-                break;
-            case AF_INTERP_LOWER:
-                t_fn = &transform_l;
-                break;
-            default:
-                AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                break;
-        }
+    const float tmat[6] = {std::round( c * 1000) / 1000.0f,
+                           std::round(-s * 1000) / 1000.0f,
+                           std::round(tx * 1000) / 1000.0f,
+                           std::round( s * 1000) / 1000.0f,
+                           std::round( c * 1000) / 1000.0f,
+                           std::round(ty * 1000) / 1000.0f,
+                          };
+
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            t_fn = &transform_n;
+            break;
+        case AF_INTERP_BILINEAR:
+            t_fn = &transform_b;
+            break;
+        case AF_INTERP_LOWER:
+            t_fn = &transform_l;
+            break;
+        default:
+            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+            break;
+    }
 
 
-        // Do transform for image
-        for(int yy = 0; yy < (int)odims[1]; yy++) {
-            for(int xx = 0; xx < (int)odims[0]; xx++) {
-                t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
-            }
+    // Do transform for image
+    for(int yy = 0; yy < (int)odims[1]; yy++) {
+        for(int xx = 0; xx < (int)odims[0]; xx++) {
+            t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
         }
     }
+}
 
-    template<typename T>
-    Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
-                     const af_interp_type method)
-    {
-        Array<T> out = createEmptyArray<T>(odims);
-
-        switch(method) {
-            case AF_INTERP_NEAREST:
-                getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
-                break;
-            case AF_INTERP_BILINEAR:
-                getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
-                break;
-            case AF_INTERP_LOWER:
-                getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
-                break;
-            default:
-                AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-                break;
-        }
+template<typename T>
+Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
+                 const af_interp_type method)
+{
+    in.eval();
 
-        return out;
+    Array<T> out = createEmptyArray<T>(odims);
+
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
+            break;
+        case AF_INTERP_BILINEAR:
+            getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
+            break;
+        case AF_INTERP_LOWER:
+            getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
+            break;
+        default:
+            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+            break;
     }
 
+    return out;
+}
+
 
 #define INSTANTIATE(T)                                                              \
     template Array<T> rotate(const Array<T> &in, const float theta,                 \
                              const af::dim4 &odims, const af_interp_type method);
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(uchar)
-    INSTANTIATE(char)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp
index 67aa586..d6321bb 100644
--- a/src/backend/cpu/set.cpp
+++ b/src/backend/cpu/set.cpp
@@ -23,116 +23,118 @@
 
 namespace cpu
 {
-    using namespace std;
-    using af::dim4;
-
-    template<typename T>
-    Array<T> setUnique(const Array<T> &in,
-                        const bool is_sorted)
-    {
-        in.eval();
-
-        Array<T> out = createEmptyArray<T>(af::dim4());
-        if (is_sorted) out = copyArray<T>(in);
-        else           out = sort<T, 1>(in, 0);
-
-        // Need to sync old jobs since we need to
-        // operator on pointers directly in std::unique
-        getQueue().sync();
-
-        T *ptr = out.get();
-        T *last = std::unique(ptr, ptr + in.elements());
-        dim_t dist = (dim_t)std::distance(ptr, last);
-
-        dim4 dims(dist, 1, 1, 1);
-        out.resetDims(dims);
-        return out;
-    }
 
-    template<typename T>
-    Array<T> setUnion(const Array<T> &first,
-                       const Array<T> &second,
-                       const bool is_unique)
-    {
-        first.eval();
-        second.eval();
-        getQueue().sync();
+using namespace std;
+using af::dim4;
 
-        Array<T> uFirst = first;
-        Array<T> uSecond = second;
+template<typename T>
+Array<T> setUnique(const Array<T> &in,
+                    const bool is_sorted)
+{
+    in.eval();
 
-        if (!is_unique) {
-            // FIXME: Perhaps copy + unique would do ?
-            uFirst  = setUnique(first, false);
-            uSecond = setUnique(second, false);
-        }
+    Array<T> out = createEmptyArray<T>(af::dim4());
+    if (is_sorted) out = copyArray<T>(in);
+    else           out = sort<T, 1>(in, 0);
 
-        dim_t first_elements  = uFirst.elements();
-        dim_t second_elements = uSecond.elements();
-        dim_t elements = first_elements + second_elements;
+    // Need to sync old jobs since we need to
+    // operator on pointers directly in std::unique
+    getQueue().sync();
 
-        Array<T> out = createEmptyArray<T>(af::dim4(elements));
+    T *ptr = out.get();
+    T *last = std::unique(ptr, ptr + in.elements());
+    dim_t dist = (dim_t)std::distance(ptr, last);
 
-        T *ptr = out.get();
-        T *last = std::set_union(uFirst.get() , uFirst.get()  + first_elements,
-                                 uSecond.get(), uSecond.get() + second_elements,
-                                 ptr);
+    dim4 dims(dist, 1, 1, 1);
+    out.resetDims(dims);
+    return out;
+}
 
-        dim_t dist = (dim_t)std::distance(ptr, last);
-        dim4 dims(dist, 1, 1, 1);
-        out.resetDims(dims);
+template<typename T>
+Array<T> setUnion(const Array<T> &first,
+                   const Array<T> &second,
+                   const bool is_unique)
+{
+    first.eval();
+    second.eval();
+    getQueue().sync();
 
-        return out;
+    Array<T> uFirst = first;
+    Array<T> uSecond = second;
+
+    if (!is_unique) {
+        // FIXME: Perhaps copy + unique would do ?
+        uFirst  = setUnique(first, false);
+        uSecond = setUnique(second, false);
     }
 
-    template<typename T>
-    Array<T> setIntersect(const Array<T> &first,
-                          const Array<T> &second,
-                          const bool is_unique)
-    {
-        first.eval();
-        second.eval();
-        getQueue().sync();
+    dim_t first_elements  = uFirst.elements();
+    dim_t second_elements = uSecond.elements();
+    dim_t elements = first_elements + second_elements;
 
-        Array<T> uFirst = first;
-        Array<T> uSecond = second;
+    Array<T> out = createEmptyArray<T>(af::dim4(elements));
 
-        if (!is_unique) {
-            uFirst  = setUnique(first, false);
-            uSecond = setUnique(second, false);
-        }
+    T *ptr = out.get();
+    T *last = std::set_union(uFirst.get() , uFirst.get()  + first_elements,
+                             uSecond.get(), uSecond.get() + second_elements,
+                             ptr);
 
-        dim_t first_elements  = uFirst.elements();
-        dim_t second_elements = uSecond.elements();
-        dim_t elements = std::max(first_elements, second_elements);
+    dim_t dist = (dim_t)std::distance(ptr, last);
+    dim4 dims(dist, 1, 1, 1);
+    out.resetDims(dims);
 
-        Array<T> out = createEmptyArray<T>(af::dim4(elements));
+    return out;
+}
 
-        T *ptr = out.get();
-        T *last = std::set_intersection(uFirst.get() , uFirst.get()  + first_elements,
-                                        uSecond.get(), uSecond.get() + second_elements,
-                                        ptr);
+template<typename T>
+Array<T> setIntersect(const Array<T> &first,
+                      const Array<T> &second,
+                      const bool is_unique)
+{
+    first.eval();
+    second.eval();
+    getQueue().sync();
 
-        dim_t dist = (dim_t)std::distance(ptr, last);
-        dim4 dims(dist, 1, 1, 1);
-        out.resetDims(dims);
+    Array<T> uFirst = first;
+    Array<T> uSecond = second;
 
-        return out;
+    if (!is_unique) {
+        uFirst  = setUnique(first, false);
+        uSecond = setUnique(second, false);
     }
 
+    dim_t first_elements  = uFirst.elements();
+    dim_t second_elements = uSecond.elements();
+    dim_t elements = std::max(first_elements, second_elements);
+
+    Array<T> out = createEmptyArray<T>(af::dim4(elements));
+
+    T *ptr = out.get();
+    T *last = std::set_intersection(uFirst.get() , uFirst.get()  + first_elements,
+                                    uSecond.get(), uSecond.get() + second_elements,
+                                    ptr);
+
+    dim_t dist = (dim_t)std::distance(ptr, last);
+    dim4 dims(dist, 1, 1, 1);
+    out.resetDims(dims);
+
+    return out;
+}
+
 #define INSTANTIATE(T)                                                  \
     template Array<T> setUnique<T>(const Array<T> &in, const bool is_sorted); \
     template Array<T> setUnion<T>(const Array<T> &first, const Array<T> &second, const bool is_unique); \
     template Array<T> setIntersect<T>(const Array<T> &first, const Array<T> &second, const bool is_unique); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
 }
diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp
index 6a2b939..766427b 100644
--- a/src/backend/cpu/shift.cpp
+++ b/src/backend/cpu/shift.cpp
@@ -17,6 +17,7 @@
 
 namespace cpu
 {
+
 static inline dim_t simple_mod(const dim_t i, const dim_t dim)
 {
     return (i < dim) ? i : (i - dim);
@@ -25,9 +26,9 @@ static inline dim_t simple_mod(const dim_t i, const dim_t dim)
 template<typename T>
 Array<T> shift(const Array<T> &in, const int sdims[4])
 {
-    Array<T> out = createEmptyArray<T>(in.dims());
-    out.eval();
     in.eval();
+
+    Array<T> out = createEmptyArray<T>(in.dims());
     const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]);
 
     auto func = [=] (Array<T> out, const Array<T> in, const af::dim4 sdims) {
@@ -91,4 +92,5 @@ INSTANTIATE(uchar)
 INSTANTIATE(char)
 INSTANTIATE(short)
 INSTANTIATE(ushort)
+
 }
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp
index 853f407..c1c92a9 100644
--- a/src/backend/cpu/sift_nonfree.hpp
+++ b/src/backend/cpu/sift_nonfree.hpp
@@ -76,161 +76,161 @@ using af::dim4;
 namespace cpu
 {
 
-    static const float PI_VAL = 3.14159265358979323846f;
+static const float PI_VAL = 3.14159265358979323846f;
 
 // default width of descriptor histogram array
-    static const int DescrWidth = 4;
+static const int DescrWidth = 4;
 
 // default number of bins per histogram in descriptor array
-    static const int DescrHistBins = 8;
+static const int DescrHistBins = 8;
 
 // assumed gaussian blur for input image
-    static const float InitSigma = 0.5f;
+static const float InitSigma = 0.5f;
 
 // width of border in which to ignore keypoints
-    static const int ImgBorder = 5;
+static const int ImgBorder = 5;
 
 // maximum steps of keypoint interpolation before failure
-    static const int MaxInterpSteps = 5;
+static const int MaxInterpSteps = 5;
 
 // default number of bins in histogram for orientation assignment
-    static const int OriHistBins = 36;
+static const int OriHistBins = 36;
 
 // determines gaussian sigma for orientation assignment
-    static const float OriSigFctr = 1.5f;
+static const float OriSigFctr = 1.5f;
 
 // determines the radius of the region used in orientation assignment */
-    static const float OriRadius = 3.0f * OriSigFctr;
+static const float OriRadius = 3.0f * OriSigFctr;
 
 // number of passes of orientation histogram smoothing
-    static const int SmoothOriPasses = 2;
+static const int SmoothOriPasses = 2;
 
 // orientation magnitude relative to max that results in new feature
-    static const float OriPeakRatio = 0.8f;
+static const float OriPeakRatio = 0.8f;
 
 // determines the size of a single descriptor orientation histogram
-    static const float DescrSclFctr = 3.f;
+static const float DescrSclFctr = 3.f;
 
 // threshold on magnitude of elements of descriptor vector
-    static const float DescrMagThr = 0.2f;
+static const float DescrMagThr = 0.2f;
 
 // factor used to convert floating-point descriptor to unsigned char
-    static const float IntDescrFctr = 512.f;
+static const float IntDescrFctr = 512.f;
 
 // Number of GLOH bins in radial direction
-    static const unsigned GLOHRadialBins = 3;
+static const unsigned GLOHRadialBins = 3;
 
 // Radiuses of GLOH descriptors
-    static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
+static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
 
 // Number of GLOH angular bins (excluding the inner-most radial section)
-    static const unsigned GLOHAngularBins = 8;
+static const unsigned GLOHAngularBins = 8;
 
 // Number of GLOH bins per histogram in descriptor
-    static const unsigned GLOHHistBins = 16;
+static const unsigned GLOHHistBins = 16;
 
-    typedef struct
-    {
-        float    f[4];
-        unsigned l;
-    } feat_t;
+typedef struct
+{
+    float    f[4];
+    unsigned l;
+} feat_t;
 
-    bool feat_cmp(feat_t i, feat_t j)
-    {
-        for (int k = 0; k < 4; k++)
-            if (i.f[k] != j.f[k])
-                return (i.f[k] < j.f[k]);
-        if (i.l != j.l)
-            return (i.l < j.l);
+bool feat_cmp(feat_t i, feat_t j)
+{
+    for (int k = 0; k < 4; k++)
+        if (i.f[k] != j.f[k])
+            return (i.f[k] < j.f[k]);
+    if (i.l != j.l)
+        return (i.l < j.l);
 
-        return true;
-    }
+    return true;
+}
 
-    void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
-    {
-        feat.resize(nfeat);
-        for (unsigned i = 0; i < feat.size(); i++) {
-            feat[i].f[0] = x[i];
-            feat[i].f[1] = y[i];
-            feat[i].f[2] = resp[i];
-            feat[i].f[3] = size[i];
-            feat[i].l    = layer[i];
-        }
+void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
+{
+    feat.resize(nfeat);
+    for (unsigned i = 0; i < feat.size(); i++) {
+        feat[i].f[0] = x[i];
+        feat[i].f[1] = y[i];
+        feat[i].f[2] = resp[i];
+        feat[i].f[3] = size[i];
+        feat[i].l    = layer[i];
     }
+}
 
-    template<typename T>
-    void gaussian1D(T* out, const int dim, double sigma=0.0)
-    {
-        if(!(sigma>0)) sigma = 0.25*dim;
-
-        T sum = (T)0;
-        for(int i=0;i<dim;i++)
-        {
-            int x = i-(dim-1)/2;
-            T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-            out[i] = el;
-            sum   += el;
-        }
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+    if(!(sigma>0)) sigma = 0.25*dim;
 
-        for(int k=0;k<dim;k++)
-            out[k] /= sum;
+    T sum = (T)0;
+    for(int i=0;i<dim;i++)
+    {
+        int x = i-(dim-1)/2;
+        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
+        out[i] = el;
+        sum   += el;
     }
 
-    template<typename T>
-    Array<T> gauss_filter(float sigma)
-    {
-        // Using 6-sigma rule
-        unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
+    for(int k=0;k<dim;k++)
+        out[k] /= sum;
+}
 
-        Array<T> filter = createEmptyArray<T>(gauss_len);
-        gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
+template<typename T>
+Array<T> gauss_filter(float sigma)
+{
+    // Using 6-sigma rule
+    unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
 
-        return filter;
-    }
+    Array<T> filter = createEmptyArray<T>(gauss_len);
+    gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
 
-    template<int N>
-    void gaussianElimination(float* A, float* b, float* x)
-    {
-        // forward elimination
-        for (int i = 0; i < N-1; i++) {
-            for (int j = i+1; j < N; j++) {
-                float s = A[j*N+i] / A[i*N+i];
+    return filter;
+}
 
-                for (int k = i; k < N; k++)
-                    A[j*N+k] -= s * A[i*N+k];
+template<int N>
+void gaussianElimination(float* A, float* b, float* x)
+{
+    // forward elimination
+    for (int i = 0; i < N-1; i++) {
+        for (int j = i+1; j < N; j++) {
+            float s = A[j*N+i] / A[i*N+i];
 
-                b[j] -= s * b[i];
-            }
+            for (int k = i; k < N; k++)
+                A[j*N+k] -= s * A[i*N+k];
+
+            b[j] -= s * b[i];
         }
+    }
 
-        for (int i = 0; i < N; i++)
-            x[i] = 0;
+    for (int i = 0; i < N; i++)
+        x[i] = 0;
 
-        // backward substitution
-        float sum = 0;
-        for (int i = 0; i <= N-2; i++) {
-            sum = b[i];
-            for (int j = i+1; j < N; j++)
-                sum -= A[i*N+j] * x[j];
-            x[i] = sum / A[i*N+i];
-        }
+    // backward substitution
+    float sum = 0;
+    for (int i = 0; i <= N-2; i++) {
+        sum = b[i];
+        for (int j = i+1; j < N; j++)
+            sum -= A[i*N+j] * x[j];
+        x[i] = sum / A[i*N+i];
     }
+}
 
-    template<typename T>
-    void sub(
-        Array<T>& out,
-        const Array<T>& in1,
-        const Array<T>& in2)
-    {
-        size_t nel = in1.elements();
-        T* out_ptr = out.get();
-        const T* in1_ptr = in1.get();
-        const T* in2_ptr = in2.get();
+template<typename T>
+void sub(
+    Array<T>& out,
+    const Array<T>& in1,
+    const Array<T>& in2)
+{
+    size_t nel = in1.elements();
+    T* out_ptr = out.get();
+    const T* in1_ptr = in1.get();
+    const T* in2_ptr = in2.get();
 
-        for (size_t i = 0; i < nel; i++) {
-            out_ptr[i] = in1_ptr[i] - in2_ptr[i];
-        }
+    for (size_t i = 0; i < nel; i++) {
+        out_ptr[i] = in1_ptr[i] - in2_ptr[i];
     }
+}
 
 #define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)])
 #define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)])
@@ -238,957 +238,958 @@ namespace cpu
 
 // Determines whether a pixel is a scale-space extremum by comparing it to its
 // 3x3x3 pixel neighborhood.
-    template<typename T>
-    void detectExtrema(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        unsigned* counter,
-        const Array<T>& prev,
-        const Array<T>& center,
-        const Array<T>& next,
-        const unsigned layer,
-        const unsigned max_feat,
-        const float threshold)
-    {
-        const af::dim4 idims = center.dims();
-        const T* prev_ptr    = prev.get();
-        const T* center_ptr  = center.get();
-        const T* next_ptr    = next.get();
-
-        for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
-            for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
-                float p = center_ptr[y*idims[0] + x];
-
-                // Find extrema
-                if (abs((float)p) > threshold &&
-                    ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
-                      p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y,   x+1)  &&
-                      p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1)  &&
-                      p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1)  &&
-                      p > PPTR(y,   x-1) && p > PPTR(y  , x) && p > PPTR(y,   x+1)  &&
-                      p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1)  &&
-                      p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1)  &&
-                      p > NPTR(y,   x-1) && p > NPTR(y  , x) && p > NPTR(y,   x+1)  &&
-                      p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
-                     (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
-                      p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y,   x+1)  &&
-                      p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1)  &&
-                      p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1)  &&
-                      p < PPTR(y,   x-1) && p < PPTR(y  , x) && p < PPTR(y,   x+1)  &&
-                      p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1)  &&
-                      p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1)  &&
-                      p < NPTR(y,   x-1) && p < NPTR(y  , x) && p < NPTR(y,   x+1)  &&
-                      p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
-
-                    if (*counter < max_feat)
-                    {
-                        x_out[*counter] = (float)y;
-                        y_out[*counter] = (float)x;
-                        layer_out[*counter] = layer;
-                        (*counter)++;
-                    }
+template<typename T>
+void detectExtrema(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    unsigned* counter,
+    const Array<T>& prev,
+    const Array<T>& center,
+    const Array<T>& next,
+    const unsigned layer,
+    const unsigned max_feat,
+    const float threshold)
+{
+    const af::dim4 idims = center.dims();
+    const T* prev_ptr    = prev.get();
+    const T* center_ptr  = center.get();
+    const T* next_ptr    = next.get();
+
+    for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
+        for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
+            float p = center_ptr[y*idims[0] + x];
+
+            // Find extrema
+            if (abs((float)p) > threshold &&
+                ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
+                  p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y,   x+1)  &&
+                  p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1)  &&
+                  p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1)  &&
+                  p > PPTR(y,   x-1) && p > PPTR(y  , x) && p > PPTR(y,   x+1)  &&
+                  p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1)  &&
+                  p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1)  &&
+                  p > NPTR(y,   x-1) && p > NPTR(y  , x) && p > NPTR(y,   x+1)  &&
+                  p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
+                 (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
+                  p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y,   x+1)  &&
+                  p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1)  &&
+                  p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1)  &&
+                  p < PPTR(y,   x-1) && p < PPTR(y  , x) && p < PPTR(y,   x+1)  &&
+                  p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1)  &&
+                  p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1)  &&
+                  p < NPTR(y,   x-1) && p < NPTR(y  , x) && p < NPTR(y,   x+1)  &&
+                  p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
+
+                if (*counter < max_feat)
+                {
+                    x_out[*counter] = (float)y;
+                    y_out[*counter] = (float)x;
+                    layer_out[*counter] = layer;
+                    (*counter)++;
                 }
             }
         }
     }
+}
 
 // Interpolates a scale-space extremum's location and scale to subpixel
 // accuracy to form an image feature. Rejects features with low contrast.
 // Based on Section 4 of Lowe's paper.
-    template<typename T>
-    void interpolateExtrema(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        unsigned* counter,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const unsigned extrema_feat,
-        std::vector< Array<T> >& dog_pyr,
-        const unsigned max_feat,
-        const unsigned octave,
-        const unsigned n_layers,
-        const float contrast_thr,
-        const float edge_thr,
-        const float sigma,
-        const float img_scale)
-    {
-        for (int f = 0; f < (int)extrema_feat; f++) {
-            const float first_deriv_scale = img_scale*0.5f;
-            const float second_deriv_scale = img_scale;
-            const float cross_deriv_scale = img_scale*0.25f;
-
-            float xl = 0, xy = 0, xx = 0, contr = 0;
-            int i = 0;
-
-            unsigned x = x_in[f];
-            unsigned y = y_in[f];
-            unsigned layer = layer_in[f];
-
-            const T* prev_ptr   = dog_pyr[octave*(n_layers+2) + layer-1].get();
-            const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
-            const T* next_ptr   = dog_pyr[octave*(n_layers+2) + layer+1].get();
-
-            af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
-
-            bool converges = true;
-
-            for (i = 0; i < MaxInterpSteps; i++) {
-                float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
-                               (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
-                               (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-
-                float d2  = CPTR(x, y) * 2.f;
-                float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
-                float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
-                float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
-                float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
-                             CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
-                float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
-                             PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
-                float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
-                             PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
-                float H[9] = {dxx, dxy, dxs,
-                              dxy, dyy, dys,
-                              dxs, dys, dss};
-
-                float X[3];
-                gaussianElimination<3>(H, dD, X);
-
-                xl = -X[2];
-                xy = -X[1];
-                xx = -X[0];
-
-                if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
-                    break;
-
-                x += round(xx);
-                y += round(xy);
-                layer += round(xl);
-
-                if (layer < 1 || layer > n_layers ||
-                    x < ImgBorder || x >= idims[1] - ImgBorder ||
-                    y < ImgBorder || y >= idims[0] - ImgBorder) {
-                    converges = false;
-                    break;
-                }
-            }
+template<typename T>
+void interpolateExtrema(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    float* response_out,
+    float* size_out,
+    unsigned* counter,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const unsigned extrema_feat,
+    std::vector< Array<T> >& dog_pyr,
+    const unsigned max_feat,
+    const unsigned octave,
+    const unsigned n_layers,
+    const float contrast_thr,
+    const float edge_thr,
+    const float sigma,
+    const float img_scale)
+{
+    for (int f = 0; f < (int)extrema_feat; f++) {
+        const float first_deriv_scale = img_scale*0.5f;
+        const float second_deriv_scale = img_scale;
+        const float cross_deriv_scale = img_scale*0.25f;
 
-            // ensure convergence of interpolation
-            if (i >= MaxInterpSteps || !converges)
-                continue;
+        float xl = 0, xy = 0, xx = 0, contr = 0;
+        int i = 0;
+
+        unsigned x = x_in[f];
+        unsigned y = y_in[f];
+        unsigned layer = layer_in[f];
 
+        const T* prev_ptr   = dog_pyr[octave*(n_layers+2) + layer-1].get();
+        const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
+        const T* next_ptr   = dog_pyr[octave*(n_layers+2) + layer+1].get();
+
+        af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
+
+        bool converges = true;
+
+        for (i = 0; i < MaxInterpSteps; i++) {
             float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
                            (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
                            (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
-            float X[3] = {xx, xy, xl};
 
-            float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
-            contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
-            if(abs(contr) < (contrast_thr / n_layers))
-                continue;
-
-            // principal curvatures are computed using the trace and det of Hessian
             float d2  = CPTR(x, y) * 2.f;
             float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
             float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
+            float dss = (NPTR(x, y  ) + PPTR(x, y  ) - d2) * second_deriv_scale;
             float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
                          CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
+            float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
+                         PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
+            float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
+                         PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
+
+            float H[9] = {dxx, dxy, dxs,
+                          dxy, dyy, dys,
+                          dxs, dys, dss};
+
+            float X[3];
+            gaussianElimination<3>(H, dD, X);
+
+            xl = -X[2];
+            xy = -X[1];
+            xx = -X[0];
+
+            if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
+                break;
+
+            x += round(xx);
+            y += round(xy);
+            layer += round(xl);
+
+            if (layer < 1 || layer > n_layers ||
+                x < ImgBorder || x >= idims[1] - ImgBorder ||
+                y < ImgBorder || y >= idims[0] - ImgBorder) {
+                converges = false;
+                break;
+            }
+        }
 
-            float tr = dxx + dyy;
-            float det = dxx * dyy - dxy * dxy;
+        // ensure convergence of interpolation
+        if (i >= MaxInterpSteps || !converges)
+            continue;
 
-            // add FLT_EPSILON for double-precision compatibility
-            if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
-                continue;
+        float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
+                       (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
+                       (float)(NPTR(x, y)   - PPTR(x, y))   * first_deriv_scale};
+        float X[3] = {xx, xy, xl};
 
-            if (*counter < max_feat)
-            {
-                x_out[*counter] = (x + xx) * (1 << octave);
-                y_out[*counter] = (y + xy) * (1 << octave);
-                layer_out[*counter] = layer;
-                response_out[*counter] = abs(contr);
-                size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f;
-                (*counter)++;
-            }
+        float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
+
+        contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
+        if(abs(contr) < (contrast_thr / n_layers))
+            continue;
+
+        // principal curvatures are computed using the trace and det of Hessian
+        float d2  = CPTR(x, y) * 2.f;
+        float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
+        float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
+        float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
+                     CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
+
+        float tr = dxx + dyy;
+        float det = dxx * dyy - dxy * dxy;
+
+        // add FLT_EPSILON for double-precision compatibility
+        if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
+            continue;
+
+        if (*counter < max_feat)
+        {
+            x_out[*counter] = (x + xx) * (1 << octave);
+            y_out[*counter] = (y + xy) * (1 << octave);
+            layer_out[*counter] = layer;
+            response_out[*counter] = abs(contr);
+            size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f;
+            (*counter)++;
         }
     }
+}
 
 #undef CPTR
 #undef PPTR
 #undef NPTR
 
 // Remove duplicate keypoints
-    void removeDuplicates(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        unsigned* counter,
-        const std::vector<feat_t>& sorted_feat)
-    {
-        size_t nfeat = sorted_feat.size();
-
-        for (size_t f = 0; f < nfeat; f++) {
-            float prec_fctr = 1e4f;
-
-            if (f < nfeat-1) {
-                if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
-                    round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
-                    round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
-                    round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
-                    sorted_feat[f].l == sorted_feat[f+1].l)
-                    continue;
-            }
+void removeDuplicates(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    float* response_out,
+    float* size_out,
+    unsigned* counter,
+    const std::vector<feat_t>& sorted_feat)
+{
+    size_t nfeat = sorted_feat.size();
 
-            x_out[*counter] = sorted_feat[f].f[0];
-            y_out[*counter] = sorted_feat[f].f[1];
-            response_out[*counter] = sorted_feat[f].f[2];
-            size_out[*counter] = sorted_feat[f].f[3];
-            layer_out[*counter] = sorted_feat[f].l;
-            (*counter)++;
+    for (size_t f = 0; f < nfeat; f++) {
+        float prec_fctr = 1e4f;
+
+        if (f < nfeat-1) {
+            if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
+                round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
+                round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
+                round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
+                sorted_feat[f].l == sorted_feat[f+1].l)
+                continue;
         }
+
+        x_out[*counter] = sorted_feat[f].f[0];
+        y_out[*counter] = sorted_feat[f].f[1];
+        response_out[*counter] = sorted_feat[f].f[2];
+        size_out[*counter] = sorted_feat[f].f[3];
+        layer_out[*counter] = sorted_feat[f].l;
+        (*counter)++;
     }
+}
 
 #define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)])
 
 // Computes a canonical orientation for each image feature in an array.  Based
 // on Section 5 of Lowe's paper.  This function adds features to the array when
 // there is more than one dominant orientation at a given feature location.
-    template<typename T>
-    void calcOrientation(
-        float* x_out,
-        float* y_out,
-        unsigned* layer_out,
-        float* response_out,
-        float* size_out,
-        float* ori_out,
-        unsigned* counter,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const unsigned max_feat,
-        const unsigned octave,
-        const unsigned n_layers,
-        const bool double_input)
-    {
-        const int n = OriHistBins;
+template<typename T>
+void calcOrientation(
+    float* x_out,
+    float* y_out,
+    unsigned* layer_out,
+    float* response_out,
+    float* size_out,
+    float* ori_out,
+    unsigned* counter,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const unsigned total_feat,
+    const std::vector< Array<T> >& gauss_pyr,
+    const unsigned max_feat,
+    const unsigned octave,
+    const unsigned n_layers,
+    const bool double_input)
+{
+    const int n = OriHistBins;
 
-        float hist[OriHistBins];
-        float temphist[OriHistBins];
+    float hist[OriHistBins];
+    float temphist[OriHistBins];
 
-        for (unsigned f = 0; f < total_feat; f++) {
-            // Load keypoint information
-            const float real_x = x_in[f];
-            const float real_y = y_in[f];
-            const unsigned layer = layer_in[f];
-            const float response = response_in[f];
-            const float size = size_in[f];
+    for (unsigned f = 0; f < total_feat; f++) {
+        // Load keypoint information
+        const float real_x = x_in[f];
+        const float real_y = y_in[f];
+        const unsigned layer = layer_in[f];
+        const float response = response_in[f];
+        const float size = size_in[f];
 
-            const int pt_x = (int)round(real_x / (1 << octave));
-            const int pt_y = (int)round(real_y / (1 << octave));
+        const int pt_x = (int)round(real_x / (1 << octave));
+        const int pt_y = (int)round(real_y / (1 << octave));
 
-            // Calculate auxiliary parameters
-            const float scl_octv = size*0.5f / (1 << octave);
-            const int radius = (int)round(OriRadius * scl_octv);
-            const float sigma = OriSigFctr * scl_octv;
-            const int len = (radius*2+1);
-            const float exp_denom = 2.f * sigma * sigma;
+        // Calculate auxiliary parameters
+        const float scl_octv = size*0.5f / (1 << octave);
+        const int radius = (int)round(OriRadius * scl_octv);
+        const float sigma = OriSigFctr * scl_octv;
+        const int len = (radius*2+1);
+        const float exp_denom = 2.f * sigma * sigma;
 
-            // Points img to correct Gaussian pyramid layer
-            const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
+        // Points img to correct Gaussian pyramid layer
+        const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+        const T* img_ptr = img.get();
 
-            for (int i = 0; i < OriHistBins; i++)
-                hist[i] = 0.f;
+        for (int i = 0; i < OriHistBins; i++)
+            hist[i] = 0.f;
 
-            af::dim4 idims = img.dims();
+        af::dim4 idims = img.dims();
 
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
+        // Calculate orientation histogram
+        for (int l = 0; l < len*len; l++) {
+            int i = l / len - radius;
+            int j = l % len - radius;
 
-                int y = pt_y + i;
-                int x = pt_x + j;
-                if (y < 1 || y >= idims[0] - 1 ||
-                    x < 1 || x >= idims[1] - 1)
-                    continue;
+            int y = pt_y + i;
+            int x = pt_x + j;
+            if (y < 1 || y >= idims[0] - 1 ||
+                x < 1 || x >= idims[1] - 1)
+                continue;
 
-                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+            float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+            float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
 
-                float mag = sqrt(dx*dx+dy*dy);
-                float ori = atan2(dy,dx);
-                float w = exp(-(i*i + j*j)/exp_denom);
+            float mag = sqrt(dx*dx+dy*dy);
+            float ori = atan2(dy,dx);
+            float w = exp(-(i*i + j*j)/exp_denom);
 
-                int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
-                bin = bin < n ? bin : 0;
+            int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
+            bin = bin < n ? bin : 0;
 
-                hist[bin] += w*mag;
-            }
+            hist[bin] += w*mag;
+        }
 
-            for (int i = 0; i < SmoothOriPasses; i++) {
-                for (int j = 0; j < n; j++) {
-                    temphist[j] = hist[j];
-                }
-                for (int j = 0; j < n; j++) {
-                    float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
-                    float next = (j+1 == n) ? temphist[0] : temphist[j+1];
-                    hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
-                }
+        for (int i = 0; i < SmoothOriPasses; i++) {
+            for (int j = 0; j < n; j++) {
+                temphist[j] = hist[j];
             }
-
-            float omax = hist[0];
-            for (int i = 1; i < n; i++)
-                omax = max(omax, hist[i]);
-
-            float mag_thr = (float)(omax * OriPeakRatio);
-            int l, r;
             for (int j = 0; j < n; j++) {
-                l = (j == 0) ? n - 1 : j - 1;
-                r = (j + 1) % n;
-                if (hist[j] > hist[l] &&
-                    hist[j] > hist[r] &&
-                    hist[j] >= mag_thr) {
-                    if (*counter < max_feat) {
-                        float bin = j + 0.5f * (hist[l] - hist[r]) /
-                            (hist[l] - 2.0f*hist[j] + hist[r]);
-                        bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
-                        float ori = 360.f - ((360.f/n) * bin);
-
-                        float new_real_x = real_x;
-                        float new_real_y = real_y;
-                        float new_size = size;
-
-                        if (double_input) {
-                            float scale = 0.5f;
-                            new_real_x *= scale;
-                            new_real_y *= scale;
-                            new_size *= scale;
-                        }
+                float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
+                float next = (j+1 == n) ? temphist[0] : temphist[j+1];
+                hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
+            }
+        }
 
-                        x_out[*counter] = new_real_x;
-                        y_out[*counter] = new_real_y;
-                        layer_out[*counter] = layer;
-                        response_out[*counter] = response;
-                        size_out[*counter] = new_size;
-                        ori_out[*counter] = ori;
-                        (*counter)++;
+        float omax = hist[0];
+        for (int i = 1; i < n; i++)
+            omax = max(omax, hist[i]);
+
+        float mag_thr = (float)(omax * OriPeakRatio);
+        int l, r;
+        for (int j = 0; j < n; j++) {
+            l = (j == 0) ? n - 1 : j - 1;
+            r = (j + 1) % n;
+            if (hist[j] > hist[l] &&
+                hist[j] > hist[r] &&
+                hist[j] >= mag_thr) {
+                if (*counter < max_feat) {
+                    float bin = j + 0.5f * (hist[l] - hist[r]) /
+                        (hist[l] - 2.0f*hist[j] + hist[r]);
+                    bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
+                    float ori = 360.f - ((360.f/n) * bin);
+
+                    float new_real_x = real_x;
+                    float new_real_y = real_y;
+                    float new_size = size;
+
+                    if (double_input) {
+                        float scale = 0.5f;
+                        new_real_x *= scale;
+                        new_real_y *= scale;
+                        new_size *= scale;
                     }
+
+                    x_out[*counter] = new_real_x;
+                    y_out[*counter] = new_real_y;
+                    layer_out[*counter] = layer;
+                    response_out[*counter] = response;
+                    size_out[*counter] = new_size;
+                    ori_out[*counter] = ori;
+                    (*counter)++;
                 }
             }
         }
     }
+}
 
-    void normalizeDesc(
-        float* desc,
-        const int histlen)
-    {
-        float len_sq = 0.0f;
+void normalizeDesc(
+    float* desc,
+    const int histlen)
+{
+    float len_sq = 0.0f;
 
-        for (int i = 0; i < histlen; i++)
-            len_sq += desc[i] * desc[i];
+    for (int i = 0; i < histlen; i++)
+        len_sq += desc[i] * desc[i];
 
-        float len_inv = 1.0f / sqrt(len_sq);
+    float len_inv = 1.0f / sqrt(len_sq);
 
-        for (int i = 0; i < histlen; i++) {
-            desc[i] *= len_inv;
-        }
+    for (int i = 0; i < histlen; i++) {
+        desc[i] *= len_inv;
     }
+}
 
 // Computes feature descriptors for features in an array.  Based on Section 6
 // of Lowe's paper.
-    template<typename T>
-    void computeDescriptor(
-        float* desc_out,
-        const unsigned desc_len,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const float* ori_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const int d,
-        const int n,
-        const float scale,
-        const unsigned octave,
-        const unsigned n_layers)
-    {
-        float desc[128];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            const unsigned layer = layer_in[f];
-            float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-            ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-            const float size = size_in[f];
-            const int fx = round(x_in[f] * scale);
-            const int fy = round(y_in[f] * scale);
-
-            // Points img to correct Gaussian pyramid layer
-            Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-            af::dim4 idims = img.dims();
-
-            float cos_t = cos(ori);
-            float sin_t = sin(ori);
-            float bins_per_rad = n / (PI_VAL * 2.f);
-            float exp_denom = d * d * 0.5f;
-            float hist_width = DescrSclFctr * size * scale * 0.5f;
-            int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-            int len = radius*2+1;
-
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = 0.f;
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = fy + i;
-                int x = fx + j;
-
-                float x_rot = (j * cos_t - i * sin_t) / hist_width;
-                float y_rot = (j * sin_t + i * cos_t) / hist_width;
-                float xbin = x_rot + d/2 - 0.5f;
-                float ybin = y_rot + d/2 - 0.5f;
-
-                if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
-                    y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
-                    float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                    float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                    float grad_mag = sqrt(dx*dx + dy*dy);
-                    float grad_ori = atan2(dy, dx) - ori;
-                    while (grad_ori < 0.0f)
-                        grad_ori += PI_VAL*2;
-                    while (grad_ori >= PI_VAL*2)
-                        grad_ori -= PI_VAL*2;
-
-                    float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
-                    float obin = grad_ori * bins_per_rad;
-                    float mag = grad_mag*w;
-
-                    int x0 = floor(xbin);
-                    int y0 = floor(ybin);
-                    int o0 = floor(obin);
-                    xbin -= x0;
-                    ybin -= y0;
-                    obin -= o0;
-
-                    for (int yl = 0; yl <= 1; yl++) {
-                        int yb = y0 + yl;
-                        if (yb >= 0 && yb < d) {
-                            float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
-                            for (int xl = 0; xl <= 1; xl++) {
-                                int xb = x0 + xl;
-                                if (xb >= 0 && xb < d) {
-                                    float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
-                                    for (int ol = 0; ol <= 1; ol++) {
-                                        int ob = (o0 + ol) % n;
-                                        float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
-                                        desc[(yb*d + xb)*n + ob] += v_o;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
+template<typename T>
+void computeDescriptor(
+    float* desc_out,
+    const unsigned desc_len,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const float* ori_in,
+    const unsigned total_feat,
+    const std::vector< Array<T> >& gauss_pyr,
+    const int d,
+    const int n,
+    const float scale,
+    const unsigned octave,
+    const unsigned n_layers)
+{
+    float desc[128];
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        const unsigned layer = layer_in[f];
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+        const float size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        // Points img to correct Gaussian pyramid layer
+        Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+        const T* img_ptr = img.get();
+        af::dim4 idims = img.dims();
 
-            normalizeDesc(desc, desc_len);
+        float cos_t = cos(ori);
+        float sin_t = sin(ori);
+        float bins_per_rad = n / (PI_VAL * 2.f);
+        float exp_denom = d * d * 0.5f;
+        float hist_width = DescrSclFctr * size * scale * 0.5f;
+        int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
 
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = min(desc[i], DescrMagThr);
+        int len = radius*2+1;
 
-            normalizeDesc(desc, desc_len);
+        for (int i = 0; i < (int)desc_len; i++)
+            desc[i] = 0.f;
 
-            // Calculate final descriptor values
-            for (int k = 0; k < (int)desc_len; k++) {
-                desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
-            }
-        }
-    }
+        // Calculate orientation histogram
+        for (int l = 0; l < len*len; l++) {
+            int i = l / len - radius;
+            int j = l % len - radius;
 
-// Computes GLOH feature descriptors for features in an array. Based on Section III-B
-// of Mikolajczyk and Schmid paper.
-    template<typename T>
-    void computeGLOHDescriptor(
-        float* desc_out,
-        const unsigned desc_len,
-        const float* x_in,
-        const float* y_in,
-        const unsigned* layer_in,
-        const float* response_in,
-        const float* size_in,
-        const float* ori_in,
-        const unsigned total_feat,
-        const std::vector< Array<T> >& gauss_pyr,
-        const int d,
-        const unsigned rb,
-        const unsigned ab,
-        const unsigned hb,
-        const float scale,
-        const unsigned octave,
-        const unsigned n_layers)
-    {
-        float desc[272];
-
-        for (unsigned f = 0; f < total_feat; f++) {
-            const unsigned layer = layer_in[f];
-            float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
-            ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
-            const float size = size_in[f];
-            const int fx = round(x_in[f] * scale);
-            const int fy = round(y_in[f] * scale);
-
-            // Points img to correct Gaussian pyramid layer
-            Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
-            const T* img_ptr = img.get();
-            af::dim4 idims = img.dims();
-
-            float cos_t = cos(ori);
-            float sin_t = sin(ori);
-            float hist_bins_per_rad = hb / (PI_VAL * 2.f);
-            float polar_bins_per_rad = ab / (PI_VAL * 2.f);
-            float exp_denom = GLOHRadii[rb-1] * 0.5f;
-
-            float hist_width = DescrSclFctr * size * scale * 0.5f;
-
-            // Keep same descriptor radius used for SIFT
-            int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
-            // Alternative radius size calculation, changing the radius weight
-            // (rw) in the range of 0.25f-0.75f gives different results,
-            // increasing it tends to show a better recall rate but with a
-            // smaller amount of correct matches
-            //float rw = 0.5f;
-            //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
-
-            int len = radius*2+1;
-
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = 0.f;
-
-            // Calculate orientation histogram
-            for (int l = 0; l < len*len; l++) {
-                int i = l / len - radius;
-                int j = l % len - radius;
-
-                int y = fy + i;
-                int x = fx + j;
-
-                float x_rot = (j * cos_t - i * sin_t);
-                float y_rot = (j * sin_t + i * cos_t);
-
-                float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
-                float theta = atan2(y_rot, x_rot);
-                while (theta < 0.0f)
-                    theta += PI_VAL*2;
-                while (theta >= PI_VAL*2)
-                    theta -= PI_VAL*2;
-
-                float tbin = theta * polar_bins_per_rad;
-                float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
-                             ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
-                             min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
-
-                if (r <= GLOHRadii[rb-1] &&
-                    y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
-                    float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
-                    float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
-                    float grad_mag = sqrt(dx*dx + dy*dy);
-                    float grad_ori = atan2(dy, dx) - ori;
-                    while (grad_ori < 0.0f)
-                        grad_ori += PI_VAL*2;
-                    while (grad_ori >= PI_VAL*2)
-                        grad_ori -= PI_VAL*2;
-
-                    float w = exp(-r / exp_denom);
-                    float obin = grad_ori * hist_bins_per_rad;
-                    float mag = grad_mag*w;
-
-                    int t0 = floor(tbin);
-                    int r0 = floor(rbin);
-                    int o0 = floor(obin);
-                    tbin -= t0;
-                    rbin -= r0;
-                    obin -= o0;
-
-                    for (int rl = 0; rl <= 1; rl++) {
-                        int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
-                        float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
-                        if (rb >= 0 && rb <= 2) {
-                            for (int tl = 0; tl <= 1; tl++) {
-                                int tb = (t0 + tl) % ab;
-                                float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+            int y = fy + i;
+            int x = fx + j;
+
+            float x_rot = (j * cos_t - i * sin_t) / hist_width;
+            float y_rot = (j * sin_t + i * cos_t) / hist_width;
+            float xbin = x_rot + d/2 - 0.5f;
+            float ybin = y_rot + d/2 - 0.5f;
+
+            if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
+                y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
+                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+                float grad_mag = sqrt(dx*dx + dy*dy);
+                float grad_ori = atan2(dy, dx) - ori;
+                while (grad_ori < 0.0f)
+                    grad_ori += PI_VAL*2;
+                while (grad_ori >= PI_VAL*2)
+                    grad_ori -= PI_VAL*2;
+
+                float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
+                float obin = grad_ori * bins_per_rad;
+                float mag = grad_mag*w;
+
+                int x0 = floor(xbin);
+                int y0 = floor(ybin);
+                int o0 = floor(obin);
+                xbin -= x0;
+                ybin -= y0;
+                obin -= o0;
+
+                for (int yl = 0; yl <= 1; yl++) {
+                    int yb = y0 + yl;
+                    if (yb >= 0 && yb < d) {
+                        float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
+                        for (int xl = 0; xl <= 1; xl++) {
+                            int xb = x0 + xl;
+                            if (xb >= 0 && xb < d) {
+                                float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
                                 for (int ol = 0; ol <= 1; ol++) {
-                                    int ob = (o0 + ol) % hb;
-                                    float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
-                                    unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
-                                    desc[idx] += v_o;
+                                    int ob = (o0 + ol) % n;
+                                    float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
+                                    desc[(yb*d + xb)*n + ob] += v_o;
                                 }
                             }
                         }
                     }
                 }
             }
+        }
 
-            normalizeDesc(desc, desc_len);
+        normalizeDesc(desc, desc_len);
 
-            for (int i = 0; i < (int)desc_len; i++)
-                desc[i] = min(desc[i], DescrMagThr);
+        for (int i = 0; i < (int)desc_len; i++)
+            desc[i] = min(desc[i], DescrMagThr);
 
-            normalizeDesc(desc, desc_len);
+        normalizeDesc(desc, desc_len);
 
-            // Calculate final descriptor values
-            for (int k = 0; k < (int)desc_len; k++) {
-                desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
-            }
+        // Calculate final descriptor values
+        for (int k = 0; k < (int)desc_len; k++) {
+            desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
         }
     }
+}
 
-#undef IPTR
-
-    template<typename T, typename convAccT>
-    Array<T> createInitialImage(
-        const Array<T>& img,
-        const float init_sigma,
-        const bool double_input)
-    {
+// Computes GLOH feature descriptors for features in an array. Based on Section III-B
+// of Mikolajczyk and Schmid paper.
+template<typename T>
+void computeGLOHDescriptor(
+    float* desc_out,
+    const unsigned desc_len,
+    const float* x_in,
+    const float* y_in,
+    const unsigned* layer_in,
+    const float* response_in,
+    const float* size_in,
+    const float* ori_in,
+    const unsigned total_feat,
+    const std::vector< Array<T> >& gauss_pyr,
+    const int d,
+    const unsigned rb,
+    const unsigned ab,
+    const unsigned hb,
+    const float scale,
+    const unsigned octave,
+    const unsigned n_layers)
+{
+    float desc[272];
+
+    for (unsigned f = 0; f < total_feat; f++) {
+        const unsigned layer = layer_in[f];
+        float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+        ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+        const float size = size_in[f];
+        const int fx = round(x_in[f] * scale);
+        const int fy = round(y_in[f] * scale);
+
+        // Points img to correct Gaussian pyramid layer
+        Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+        const T* img_ptr = img.get();
         af::dim4 idims = img.dims();
 
-        Array<T> init_img = createEmptyArray<T>(af::dim4());
+        float cos_t = cos(ori);
+        float sin_t = sin(ori);
+        float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+        float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+        float exp_denom = GLOHRadii[rb-1] * 0.5f;
 
-        float s = (double_input) ? std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f)
-                                 : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f);
+        float hist_width = DescrSclFctr * size * scale * 0.5f;
 
-        Array<T> filter = gauss_filter<T>(s);
+        // Keep same descriptor radius used for SIFT
+        int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
 
-        if (double_input) {
-            Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
-            init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
-        }
-        else {
-            init_img = convolve2<T, convAccT, false>(img, filter, filter);
-        }
+        // Alternative radius size calculation, changing the radius weight
+        // (rw) in the range of 0.25f-0.75f gives different results,
+        // increasing it tends to show a better recall rate but with a
+        // smaller amount of correct matches
+        //float rw = 0.5f;
+        //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
 
-        return init_img;
-    }
+        int len = radius*2+1;
 
-    template<typename T, typename convAccT>
-    std::vector< Array<T> > buildGaussPyr(
-        const Array<T>& init_img,
-        const unsigned n_octaves,
-        const unsigned n_layers,
-        const float init_sigma)
-    {
-        // Precompute Gaussian sigmas using the following formula:
-        // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
-        std::vector<float> sig_layers(n_layers + 3);
-        sig_layers[0] = init_sigma;
-        float k = std::pow(2.0f, 1.0f / n_layers);
-        for (unsigned i = 1; i < n_layers + 3; i++) {
-            float sig_prev = std::pow(k, i-1) * init_sigma;
-            float sig_total = sig_prev * k;
-            sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
-        }
+        for (int i = 0; i < (int)desc_len; i++)
+            desc[i] = 0.f;
 
-        // Gaussian Pyramid
-        std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
-        for (unsigned o = 0; o < n_octaves; o++) {
-            for (unsigned l = 0; l < n_layers+3; l++) {
-                unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
-                unsigned idx = o*(n_layers+3) + l;
+        // Calculate orientation histogram
+        for (int l = 0; l < len*len; l++) {
+            int i = l / len - radius;
+            int j = l % len - radius;
 
-                if (o == 0 && l == 0) {
-                    gauss_pyr[idx] = init_img;
-                }
-                else if (l == 0) {
-                    af::dim4 sdims = gauss_pyr[src_idx].dims();
-                    gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
-                }
-                else {
-                    Array<T> filter = gauss_filter<T>(sig_layers[l]);
+            int y = fy + i;
+            int x = fx + j;
+
+            float x_rot = (j * cos_t - i * sin_t);
+            float y_rot = (j * sin_t + i * cos_t);
+
+            float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+            float theta = atan2(y_rot, x_rot);
+            while (theta < 0.0f)
+                theta += PI_VAL*2;
+            while (theta >= PI_VAL*2)
+                theta -= PI_VAL*2;
+
+            float tbin = theta * polar_bins_per_rad;
+            float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+                         ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+                         min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
 
-                    gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
+            if (r <= GLOHRadii[rb-1] &&
+                y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
+                float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+                float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+                float grad_mag = sqrt(dx*dx + dy*dy);
+                float grad_ori = atan2(dy, dx) - ori;
+                while (grad_ori < 0.0f)
+                    grad_ori += PI_VAL*2;
+                while (grad_ori >= PI_VAL*2)
+                    grad_ori -= PI_VAL*2;
+
+                float w = exp(-r / exp_denom);
+                float obin = grad_ori * hist_bins_per_rad;
+                float mag = grad_mag*w;
+
+                int t0 = floor(tbin);
+                int r0 = floor(rbin);
+                int o0 = floor(obin);
+                tbin -= t0;
+                rbin -= r0;
+                obin -= o0;
+
+                for (int rl = 0; rl <= 1; rl++) {
+                    int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+                    float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+                    if (rb >= 0 && rb <= 2) {
+                        for (int tl = 0; tl <= 1; tl++) {
+                            int tb = (t0 + tl) % ab;
+                            float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+                            for (int ol = 0; ol <= 1; ol++) {
+                                int ob = (o0 + ol) % hb;
+                                float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+                                unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+                                desc[idx] += v_o;
+                            }
+                        }
+                    }
                 }
             }
         }
 
-        return gauss_pyr;
-    }
+        normalizeDesc(desc, desc_len);
 
-    template<typename T>
-    std::vector< Array<T> > buildDoGPyr(
-        std::vector< Array<T> >& gauss_pyr,
-        const unsigned n_octaves,
-        const unsigned n_layers)
-    {
-        // DoG Pyramid
-        std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
-        for (unsigned o = 0; o < n_octaves; o++) {
-            for (unsigned l = 0; l < n_layers+2; l++) {
-                unsigned idx    = o*(n_layers+2) + l;
-                unsigned bottom = o*(n_layers+3) + l;
-                unsigned top    = o*(n_layers+3) + l+1;
+        for (int i = 0; i < (int)desc_len; i++)
+            desc[i] = min(desc[i], DescrMagThr);
 
-                dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
+        normalizeDesc(desc, desc_len);
 
-                sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
-            }
+        // Calculate final descriptor values
+        for (int k = 0; k < (int)desc_len; k++) {
+            desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
         }
-
-        return dog_pyr;
     }
+}
 
+#undef IPTR
 
-    template<typename T, typename convAccT>
-    unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
-                       Array<float>& ori, Array<float>& size, Array<float>& desc,
-                       const Array<T>& in, const unsigned n_layers,
-                       const float contrast_thr, const float edge_thr,
-                       const float init_sigma, const bool double_input,
-                       const float img_scale, const float feature_ratio,
-                       const bool compute_GLOH)
-    {
-        in.eval();
-        af::dim4 idims = in.dims();
+template<typename T, typename convAccT>
+Array<T> createInitialImage(
+    const Array<T>& img,
+    const float init_sigma,
+    const bool double_input)
+{
+    af::dim4 idims = img.dims();
 
-        const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
-            : min(idims[0], idims[1]);
-        const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
+    Array<T> init_img = createEmptyArray<T>(af::dim4());
 
-        Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
+    float s = (double_input) ? std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f)
+                             : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f);
 
-        std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
+    Array<T> filter = gauss_filter<T>(s);
 
-        std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
+    if (double_input) {
+        Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
+        init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
+    }
+    else {
+        init_img = convolve2<T, convAccT, false>(img, filter, filter);
+    }
 
-        std::vector<float*> x_pyr(n_octaves, NULL);
-        std::vector<float*> y_pyr(n_octaves, NULL);
-        std::vector<float*> response_pyr(n_octaves, NULL);
-        std::vector<float*> size_pyr(n_octaves, NULL);
-        std::vector<float*> ori_pyr(n_octaves, NULL);
-        std::vector<float*> desc_pyr(n_octaves, NULL);
-        std::vector<unsigned> feat_pyr(n_octaves, 0);
-        unsigned total_feat = 0;
+    return init_img;
+}
 
-        const unsigned d = DescrWidth;
-        const unsigned n = DescrHistBins;
-        const unsigned rb = GLOHRadialBins;
-        const unsigned ab = GLOHAngularBins;
-        const unsigned hb = GLOHHistBins;
-        const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
+template<typename T, typename convAccT>
+std::vector< Array<T> > buildGaussPyr(
+    const Array<T>& init_img,
+    const unsigned n_octaves,
+    const unsigned n_layers,
+    const float init_sigma)
+{
+    // Precompute Gaussian sigmas using the following formula:
+    // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
+    std::vector<float> sig_layers(n_layers + 3);
+    sig_layers[0] = init_sigma;
+    float k = std::pow(2.0f, 1.0f / n_layers);
+    for (unsigned i = 1; i < n_layers + 3; i++) {
+        float sig_prev = std::pow(k, i-1) * init_sigma;
+        float sig_total = sig_prev * k;
+        sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
+    }
 
-        for (unsigned i = 0; i < n_octaves; i++) {
-            af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
-            if (ddims[0]-2*ImgBorder < 1 ||
-                ddims[1]-2*ImgBorder < 1)
-                continue;
+    // Gaussian Pyramid
+    std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
+    for (unsigned o = 0; o < n_octaves; o++) {
+        for (unsigned l = 0; l < n_layers+3; l++) {
+            unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
+            unsigned idx = o*(n_layers+3) + l;
 
-            const unsigned imel = ddims[0] * ddims[1];
-            const unsigned max_feat = ceil(imel * feature_ratio);
+            if (o == 0 && l == 0) {
+                gauss_pyr[idx] = init_img;
+            }
+            else if (l == 0) {
+                af::dim4 sdims = gauss_pyr[src_idx].dims();
+                gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
+            }
+            else {
+                Array<T> filter = gauss_filter<T>(sig_layers[l]);
 
-            float* extrema_x = memAlloc<float>(max_feat);
-            float* extrema_y = memAlloc<float>(max_feat);
-            unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
-            unsigned extrema_feat = 0;
+                gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
+            }
+        }
+    }
 
-            for (unsigned j = 1; j <= n_layers; j++) {
-                unsigned prev   = i*(n_layers+2) + j-1;
-                unsigned center = i*(n_layers+2) + j;
-                unsigned next   = i*(n_layers+2) + j+1;
+    return gauss_pyr;
+}
 
-                unsigned layer = j;
+template<typename T>
+std::vector< Array<T> > buildDoGPyr(
+    std::vector< Array<T> >& gauss_pyr,
+    const unsigned n_octaves,
+    const unsigned n_layers)
+{
+    // DoG Pyramid
+    std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
+    for (unsigned o = 0; o < n_octaves; o++) {
+        for (unsigned l = 0; l < n_layers+2; l++) {
+            unsigned idx    = o*(n_layers+2) + l;
+            unsigned bottom = o*(n_layers+3) + l;
+            unsigned top    = o*(n_layers+3) + l+1;
 
-                float extrema_thr = 0.5f * contrast_thr / n_layers;
-                detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
-                                 dog_pyr[prev], dog_pyr[center], dog_pyr[next],
-                                 layer, max_feat, extrema_thr);
-            }
+            dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
 
-            extrema_feat = min(extrema_feat, max_feat);
+            sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
+        }
+    }
 
-            if (extrema_feat == 0) {
-                memFree(extrema_x);
-                memFree(extrema_y);
-                memFree(extrema_layer);
+    return dog_pyr;
+}
 
-                continue;
-            }
 
-            unsigned interp_feat = 0;
+template<typename T, typename convAccT>
+unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
+                   Array<float>& ori, Array<float>& size, Array<float>& desc,
+                   const Array<T>& in, const unsigned n_layers,
+                   const float contrast_thr, const float edge_thr,
+                   const float init_sigma, const bool double_input,
+                   const float img_scale, const float feature_ratio,
+                   const bool compute_GLOH)
+{
+    in.eval();
+    af::dim4 idims = in.dims();
+
+    const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
+        : min(idims[0], idims[1]);
+    const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
+
+    Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
+
+    std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
+
+    std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
+
+    std::vector<float*> x_pyr(n_octaves, NULL);
+    std::vector<float*> y_pyr(n_octaves, NULL);
+    std::vector<float*> response_pyr(n_octaves, NULL);
+    std::vector<float*> size_pyr(n_octaves, NULL);
+    std::vector<float*> ori_pyr(n_octaves, NULL);
+    std::vector<float*> desc_pyr(n_octaves, NULL);
+    std::vector<unsigned> feat_pyr(n_octaves, 0);
+    unsigned total_feat = 0;
+
+    const unsigned d = DescrWidth;
+    const unsigned n = DescrHistBins;
+    const unsigned rb = GLOHRadialBins;
+    const unsigned ab = GLOHAngularBins;
+    const unsigned hb = GLOHHistBins;
+    const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
+
+    for (unsigned i = 0; i < n_octaves; i++) {
+        af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
+        if (ddims[0]-2*ImgBorder < 1 ||
+            ddims[1]-2*ImgBorder < 1)
+            continue;
+
+        const unsigned imel = ddims[0] * ddims[1];
+        const unsigned max_feat = ceil(imel * feature_ratio);
+
+        float* extrema_x = memAlloc<float>(max_feat);
+        float* extrema_y = memAlloc<float>(max_feat);
+        unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
+        unsigned extrema_feat = 0;
+
+        for (unsigned j = 1; j <= n_layers; j++) {
+            unsigned prev   = i*(n_layers+2) + j-1;
+            unsigned center = i*(n_layers+2) + j;
+            unsigned next   = i*(n_layers+2) + j+1;
+
+            unsigned layer = j;
+
+            float extrema_thr = 0.5f * contrast_thr / n_layers;
+            detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
+                             dog_pyr[prev], dog_pyr[center], dog_pyr[next],
+                             layer, max_feat, extrema_thr);
+        }
+
+        extrema_feat = min(extrema_feat, max_feat);
 
-            float* interp_x = memAlloc<float>(extrema_feat);
-            float* interp_y = memAlloc<float>(extrema_feat);
-            unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
-            float* interp_response = memAlloc<float>(extrema_feat);
-            float* interp_size = memAlloc<float>(extrema_feat);
+        if (extrema_feat == 0) {
+            memFree(extrema_x);
+            memFree(extrema_y);
+            memFree(extrema_layer);
 
-            interpolateExtrema<T>(interp_x, interp_y, interp_layer,
-                                  interp_response, interp_size, &interp_feat,
-                                  extrema_x, extrema_y, extrema_layer, extrema_feat,
-                                  dog_pyr, max_feat, i, n_layers,
-                                  contrast_thr, edge_thr, init_sigma, img_scale);
+            continue;
+        }
 
-            interp_feat = min(interp_feat, max_feat);
+        unsigned interp_feat = 0;
 
-            if (interp_feat == 0) {
-                memFree(interp_x);
-                memFree(interp_y);
-                memFree(interp_layer);
-                memFree(interp_response);
-                memFree(interp_size);
+        float* interp_x = memAlloc<float>(extrema_feat);
+        float* interp_y = memAlloc<float>(extrema_feat);
+        unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
+        float* interp_response = memAlloc<float>(extrema_feat);
+        float* interp_size = memAlloc<float>(extrema_feat);
 
-                continue;
-            }
+        interpolateExtrema<T>(interp_x, interp_y, interp_layer,
+                              interp_response, interp_size, &interp_feat,
+                              extrema_x, extrema_y, extrema_layer, extrema_feat,
+                              dog_pyr, max_feat, i, n_layers,
+                              contrast_thr, edge_thr, init_sigma, img_scale);
 
-            std::vector<feat_t> sorted_feat;
-            array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
-            std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
+        interp_feat = min(interp_feat, max_feat);
 
+        if (interp_feat == 0) {
             memFree(interp_x);
             memFree(interp_y);
             memFree(interp_layer);
             memFree(interp_response);
             memFree(interp_size);
 
-            unsigned nodup_feat = 0;
-
-            float* nodup_x = memAlloc<float>(interp_feat);
-            float* nodup_y = memAlloc<float>(interp_feat);
-            unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
-            float* nodup_response = memAlloc<float>(interp_feat);
-            float* nodup_size = memAlloc<float>(interp_feat);
-
-            removeDuplicates(nodup_x, nodup_y, nodup_layer,
-                             nodup_response, nodup_size, &nodup_feat,
-                             sorted_feat);
-
-            const unsigned max_oriented_feat = nodup_feat * 3;
-
-            float* oriented_x = memAlloc<float>(max_oriented_feat);
-            float* oriented_y = memAlloc<float>(max_oriented_feat);
-            unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
-            float* oriented_response = memAlloc<float>(max_oriented_feat);
-            float* oriented_size = memAlloc<float>(max_oriented_feat);
-            float* oriented_ori = memAlloc<float>(max_oriented_feat);
-
-            unsigned oriented_feat = 0;
-
-            calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
-                               oriented_response, oriented_size, oriented_ori, &oriented_feat,
-                               nodup_x, nodup_y, nodup_layer,
-                               nodup_response, nodup_size, nodup_feat,
-                               gauss_pyr, max_oriented_feat, i, n_layers, double_input);
-
-            memFree(nodup_x);
-            memFree(nodup_y);
-            memFree(nodup_layer);
-            memFree(nodup_response);
-            memFree(nodup_size);
-
-            if (oriented_feat == 0) {
-                memFree(oriented_x);
-                memFree(oriented_y);
-                memFree(oriented_layer);
-                memFree(oriented_response);
-                memFree(oriented_size);
-                memFree(oriented_ori);
+            continue;
+        }
 
-                continue;
-            }
+        std::vector<feat_t> sorted_feat;
+        array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
+        std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
+
+        memFree(interp_x);
+        memFree(interp_y);
+        memFree(interp_layer);
+        memFree(interp_response);
+        memFree(interp_size);
+
+        unsigned nodup_feat = 0;
+
+        float* nodup_x = memAlloc<float>(interp_feat);
+        float* nodup_y = memAlloc<float>(interp_feat);
+        unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
+        float* nodup_response = memAlloc<float>(interp_feat);
+        float* nodup_size = memAlloc<float>(interp_feat);
+
+        removeDuplicates(nodup_x, nodup_y, nodup_layer,
+                         nodup_response, nodup_size, &nodup_feat,
+                         sorted_feat);
+
+        const unsigned max_oriented_feat = nodup_feat * 3;
+
+        float* oriented_x = memAlloc<float>(max_oriented_feat);
+        float* oriented_y = memAlloc<float>(max_oriented_feat);
+        unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
+        float* oriented_response = memAlloc<float>(max_oriented_feat);
+        float* oriented_size = memAlloc<float>(max_oriented_feat);
+        float* oriented_ori = memAlloc<float>(max_oriented_feat);
+
+        unsigned oriented_feat = 0;
+
+        calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
+                           oriented_response, oriented_size, oriented_ori, &oriented_feat,
+                           nodup_x, nodup_y, nodup_layer,
+                           nodup_response, nodup_size, nodup_feat,
+                           gauss_pyr, max_oriented_feat, i, n_layers, double_input);
+
+        memFree(nodup_x);
+        memFree(nodup_y);
+        memFree(nodup_layer);
+        memFree(nodup_response);
+        memFree(nodup_size);
+
+        if (oriented_feat == 0) {
+            memFree(oriented_x);
+            memFree(oriented_y);
+            memFree(oriented_layer);
+            memFree(oriented_response);
+            memFree(oriented_size);
+            memFree(oriented_ori);
+
+            continue;
+        }
 
-            float* desc = memAlloc<float>(oriented_feat * desc_len);
+        float* desc = memAlloc<float>(oriented_feat * desc_len);
 
-            float scale = 1.f/(1 << i);
-            if (double_input) scale *= 2.f;
+        float scale = 1.f/(1 << i);
+        if (double_input) scale *= 2.f;
 
-            if (compute_GLOH)
-                computeGLOHDescriptor<T>(desc, desc_len,
-                                         oriented_x, oriented_y, oriented_layer,
-                                         oriented_response, oriented_size, oriented_ori,
-                                         oriented_feat, gauss_pyr, d, rb, ab, hb,
-                                         scale, i, n_layers);
-            else
-                computeDescriptor<T>(desc, desc_len,
+        if (compute_GLOH)
+            computeGLOHDescriptor<T>(desc, desc_len,
                                      oriented_x, oriented_y, oriented_layer,
                                      oriented_response, oriented_size, oriented_ori,
-                                     oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
-
-            total_feat += oriented_feat;
-            feat_pyr[i] = oriented_feat;
-
-            if (oriented_feat > 0) {
-                x_pyr[i] = oriented_x;
-                y_pyr[i] = oriented_y;
-                response_pyr[i] = oriented_response;
-                ori_pyr[i] = oriented_ori;
-                size_pyr[i] = oriented_size;
-                desc_pyr[i] = desc;
-            }
+                                     oriented_feat, gauss_pyr, d, rb, ab, hb,
+                                     scale, i, n_layers);
+        else
+            computeDescriptor<T>(desc, desc_len,
+                                 oriented_x, oriented_y, oriented_layer,
+                                 oriented_response, oriented_size, oriented_ori,
+                                 oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
+
+        total_feat += oriented_feat;
+        feat_pyr[i] = oriented_feat;
+
+        if (oriented_feat > 0) {
+            x_pyr[i] = oriented_x;
+            y_pyr[i] = oriented_y;
+            response_pyr[i] = oriented_response;
+            ori_pyr[i] = oriented_ori;
+            size_pyr[i] = oriented_size;
+            desc_pyr[i] = desc;
         }
+    }
 
-        if (total_feat > 0) {
-            const af::dim4 total_feat_dims(total_feat);
-            const af::dim4 desc_dims(desc_len, total_feat);
-
-            // Allocate output memory
-            x     = createEmptyArray<float>(total_feat_dims);
-            y     = createEmptyArray<float>(total_feat_dims);
-            score = createEmptyArray<float>(total_feat_dims);
-            ori   = createEmptyArray<float>(total_feat_dims);
-            size  = createEmptyArray<float>(total_feat_dims);
-            desc  = createEmptyArray<float>(desc_dims);
-
-            float* x_ptr = x.get();
-            float* y_ptr = y.get();
-            float* score_ptr = score.get();
-            float* ori_ptr = ori.get();
-            float* size_ptr = size.get();
-            float* desc_ptr = desc.get();
-
-            unsigned offset = 0;
-            for (unsigned i = 0; i < n_octaves; i++) {
-                if (feat_pyr[i] == 0)
-                    continue;
-
-                memcpy(x_ptr+offset,     x_pyr[i],        feat_pyr[i] * sizeof(float));
-                memcpy(y_ptr+offset,     y_pyr[i],        feat_pyr[i] * sizeof(float));
-                memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
-                memcpy(ori_ptr+offset,   ori_pyr[i],      feat_pyr[i] * sizeof(float));
-                memcpy(size_ptr+offset,  size_pyr[i],     feat_pyr[i] * sizeof(float));
-
-                memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
-
-                memFree(x_pyr[i]);
-                memFree(y_pyr[i]);
-                memFree(response_pyr[i]);
-                memFree(ori_pyr[i]);
-                memFree(size_pyr[i]);
-                memFree(desc_pyr[i]);
-
-                offset += feat_pyr[i];
-            }
-        }
+    if (total_feat > 0) {
+        const af::dim4 total_feat_dims(total_feat);
+        const af::dim4 desc_dims(desc_len, total_feat);
+
+        // Allocate output memory
+        x     = createEmptyArray<float>(total_feat_dims);
+        y     = createEmptyArray<float>(total_feat_dims);
+        score = createEmptyArray<float>(total_feat_dims);
+        ori   = createEmptyArray<float>(total_feat_dims);
+        size  = createEmptyArray<float>(total_feat_dims);
+        desc  = createEmptyArray<float>(desc_dims);
+
+        float* x_ptr = x.get();
+        float* y_ptr = y.get();
+        float* score_ptr = score.get();
+        float* ori_ptr = ori.get();
+        float* size_ptr = size.get();
+        float* desc_ptr = desc.get();
+
+        unsigned offset = 0;
+        for (unsigned i = 0; i < n_octaves; i++) {
+            if (feat_pyr[i] == 0)
+                continue;
+
+            memcpy(x_ptr+offset,     x_pyr[i],        feat_pyr[i] * sizeof(float));
+            memcpy(y_ptr+offset,     y_pyr[i],        feat_pyr[i] * sizeof(float));
+            memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
+            memcpy(ori_ptr+offset,   ori_pyr[i],      feat_pyr[i] * sizeof(float));
+            memcpy(size_ptr+offset,  size_pyr[i],     feat_pyr[i] * sizeof(float));
+
+            memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
 
-        return total_feat;
+            memFree(x_pyr[i]);
+            memFree(y_pyr[i]);
+            memFree(response_pyr[i]);
+            memFree(ori_pyr[i]);
+            memFree(size_pyr[i]);
+            memFree(desc_pyr[i]);
+
+            offset += feat_pyr[i];
+        }
     }
+
+    return total_feat;
+}
+
 }
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp
index 9f683fc..ba47ba9 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/sobel.cpp
@@ -91,6 +91,7 @@ template<typename Ti, typename To>
 std::pair< Array<To>, Array<To> >
 sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
 {
+    img.eval();
     // ket_size is for future proofing, this argument is not used
     // currently
     Array<To> dx = createEmptyArray<To>(img.dims());
diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp
index b279971..0243088 100644
--- a/src/backend/cpu/solve.cpp
+++ b/src/backend/cpu/solve.cpp
@@ -75,6 +75,10 @@ template<typename T>
 Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                  const Array<T> &b, const af_mat_prop options)
 {
+    A.eval();
+    pivot.eval();
+    b.eval();
+
     int N        = A.dims()[0];
     int NRHS     = b.dims()[1];
     Array< T > B = copyArray<T>(b);
@@ -114,9 +118,10 @@ Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop o
 template<typename T>
 Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
 {
+    a.eval();
+    b.eval();
 
-    if (options & AF_MAT_UPPER ||
-        options & AF_MAT_LOWER) {
+    if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) {
         return triangleSolve<T>(a, b, options);
     }
 
@@ -178,6 +183,7 @@ Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
 
 namespace cpu
 {
+
 #define INSTANTIATE_SOLVE(T)                                            \
     template Array<T> solve<T>(const Array<T> &a, const Array<T> &b,    \
                                const af_mat_prop options);              \
@@ -188,4 +194,5 @@ INSTANTIATE_SOLVE(float)
 INSTANTIATE_SOLVE(cfloat)
 INSTANTIATE_SOLVE(double)
 INSTANTIATE_SOLVE(cdouble)
+
 }
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index 94d70a8..cbdb50e 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -25,65 +25,69 @@ using std::function;
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
 
-    // Based off of http://stackoverflow.com/a/12399290
-    template<typename T, bool isAscending>
-    void sort0(Array<T> val)
-    {
-        // initialize original index locations
-        T *val_ptr = val.get();
+///////////////////////////////////////////////////////////////////////////
+// Kernel Functions
+///////////////////////////////////////////////////////////////////////////
 
-        function<bool(T, T)> op = greater<T>();
-        if(isAscending) { op = less<T>(); }
+// Based off of http://stackoverflow.com/a/12399290
+template<typename T, bool isAscending>
+void sort0(Array<T> val)
+{
+    // initialize original index locations
+    T *val_ptr = val.get();
+
+    function<bool(T, T)> op = greater<T>();
+    if(isAscending) { op = less<T>(); }
 
-        T *comp_ptr = nullptr;
-        for(dim_t w = 0; w < val.dims()[3]; w++) {
-            dim_t valW = w * val.strides()[3];
-            for(dim_t z = 0; z < val.dims()[2]; z++) {
-                dim_t valWZ = valW + z * val.strides()[2];
-                for(dim_t y = 0; y < val.dims()[1]; y++) {
+    T *comp_ptr = nullptr;
+    for(dim_t w = 0; w < val.dims()[3]; w++) {
+        dim_t valW = w * val.strides()[3];
+        for(dim_t z = 0; z < val.dims()[2]; z++) {
+            dim_t valWZ = valW + z * val.strides()[2];
+            for(dim_t y = 0; y < val.dims()[1]; y++) {
 
-                    dim_t valOffset = valWZ + y * val.strides()[1];
+                dim_t valOffset = valWZ + y * val.strides()[1];
 
-                    comp_ptr = val_ptr + valOffset;
-                    std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
-                }
+                comp_ptr = val_ptr + valOffset;
+                std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
             }
         }
-        return;
     }
+    return;
+}
 
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, bool isAscending>
-    Array<T> sort(const Array<T> &in, const unsigned dim)
-    {
-        Array<T> out = copyArray<T>(in);
-        switch(dim) {
-            case 0: getQueue().enqueue(sort0<T, isAscending>, out); break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-        }
-        return out;
+///////////////////////////////////////////////////////////////////////////
+// Wrapper Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename T, bool isAscending>
+Array<T> sort(const Array<T> &in, const unsigned dim)
+{
+    in.eval();
+
+    Array<T> out = copyArray<T>(in);
+    switch(dim) {
+        case 0: getQueue().enqueue(sort0<T, isAscending>, out); break;
+        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+    return out;
+}
 
 #define INSTANTIATE(T)                                                  \
     template Array<T> sort<T, true>(const Array<T> &in, const unsigned dim); \
     template Array<T> sort<T,false>(const Array<T> &in, const unsigned dim); \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    //INSTANTIATE(cfloat)
-    //INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+//INSTANTIATE(cfloat)
+//INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
 }
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index f07d585..f941534 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -23,68 +23,71 @@ using std::sort;
 
 namespace cpu
 {
-    ///////////////////////////////////////////////////////////////////////////
-    // Kernel Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, bool isAscending>
-    void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
-    {
-        // initialize original index locations
-           uint *idx_ptr = idx.get();
-              T *val_ptr = val.get();
-        const T *in_ptr  = in.get();
-        function<bool(T, T)> op = greater<T>();
-        if(isAscending) { op = less<T>(); }
-
-        std::vector<uint> seq_vec(idx.dims()[0]);
-        std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
-        const T *comp_ptr = nullptr;
-        auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
-        for(dim_t w = 0; w < in.dims()[3]; w++) {
-            dim_t valW = w * val.strides()[3];
-            dim_t idxW = w * idx.strides()[3];
-            dim_t  inW = w *  in.strides()[3];
-            for(dim_t z = 0; z < in.dims()[2]; z++) {
-                dim_t valWZ = valW + z * val.strides()[2];
-                dim_t idxWZ = idxW + z * idx.strides()[2];
-                dim_t  inWZ =  inW + z *  in.strides()[2];
-                for(dim_t y = 0; y < in.dims()[1]; y++) {
-
-                    dim_t valOffset = valWZ + y * val.strides()[1];
-                    dim_t idxOffset = idxWZ + y * idx.strides()[1];
-                    dim_t inOffset  =  inWZ + y *  in.strides()[1];
-
-                    uint *ptr = idx_ptr + idxOffset;
-                    std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
-                    comp_ptr = in_ptr + inOffset;
-                    std::stable_sort(ptr, ptr + in.dims()[0], comparator);
-
-                    for (dim_t i = 0; i < val.dims()[0]; ++i){
-                        val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
-                    }
+
+///////////////////////////////////////////////////////////////////////////
+// Kernel Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename T, bool isAscending>
+void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
+{
+    // initialize original index locations
+       uint *idx_ptr = idx.get();
+          T *val_ptr = val.get();
+    const T *in_ptr  = in.get();
+    function<bool(T, T)> op = greater<T>();
+    if(isAscending) { op = less<T>(); }
+
+    std::vector<uint> seq_vec(idx.dims()[0]);
+    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+
+    const T *comp_ptr = nullptr;
+    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+
+    for(dim_t w = 0; w < in.dims()[3]; w++) {
+        dim_t valW = w * val.strides()[3];
+        dim_t idxW = w * idx.strides()[3];
+        dim_t  inW = w *  in.strides()[3];
+        for(dim_t z = 0; z < in.dims()[2]; z++) {
+            dim_t valWZ = valW + z * val.strides()[2];
+            dim_t idxWZ = idxW + z * idx.strides()[2];
+            dim_t  inWZ =  inW + z *  in.strides()[2];
+            for(dim_t y = 0; y < in.dims()[1]; y++) {
+
+                dim_t valOffset = valWZ + y * val.strides()[1];
+                dim_t idxOffset = idxWZ + y * idx.strides()[1];
+                dim_t inOffset  =  inWZ + y *  in.strides()[1];
+
+                uint *ptr = idx_ptr + idxOffset;
+                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+
+                comp_ptr = in_ptr + inOffset;
+                std::stable_sort(ptr, ptr + in.dims()[0], comparator);
+
+                for (dim_t i = 0; i < val.dims()[0]; ++i){
+                    val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
                 }
             }
         }
-
-        return;
     }
 
-    ///////////////////////////////////////////////////////////////////////////
-    // Wrapper Functions
-    ///////////////////////////////////////////////////////////////////////////
-    template<typename T, bool isAscending>
-    void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
-    {
-        val = createEmptyArray<T>(in.dims());
-        idx = createEmptyArray<uint>(in.dims());
-        switch(dim) {
-            case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-        }
+    return;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Wrapper Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename T, bool isAscending>
+void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
+{
+    in.eval();
+
+    val = createEmptyArray<T>(in.dims());
+    idx = createEmptyArray<uint>(in.dims());
+    switch(dim) {
+        case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
+        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+}
 
 #define INSTANTIATE(T)                                                  \
     template void sort_index<T, true>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
@@ -92,16 +95,17 @@ namespace cpu
     template void sort_index<T,false>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
                                       const uint dim);                  \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    //INSTANTIATE(cfloat)
-    //INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+//INSTANTIATE(cfloat)
+//INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
 }
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index e2c908c..c278908 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -106,6 +106,8 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
                const unsigned radius, const float diff_thr, const float geom_thr,
                const float feature_ratio, const unsigned edge)
 {
+    in.eval();
+
     dim4 idims = in.dims();
     const unsigned corner_lim = in.elements() * feature_ratio;
 
diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp
index 39cbb66..92912ca 100644
--- a/src/backend/cpu/svd.cpp
+++ b/src/backend/cpu/svd.cpp
@@ -30,101 +30,106 @@ namespace cpu
 
 #if defined(USE_MKL) || defined(__APPLE__)
 
-    template<typename T, typename Tr>
-    using svd_func_def = int (*)(ORDER_TYPE,
-                                 char jobz,
-                                 int m, int n,
-                                 T* in, int ldin,
-                                 Tr* s,
-                                 T* u, int ldu,
-                                 T* vt, int ldvt);
-
-    SVD_FUNC_DEF( gesdd )
-    SVD_FUNC(gesdd, float  , float , s)
-    SVD_FUNC(gesdd, double , double, d)
-    SVD_FUNC(gesdd, cfloat , float , c)
-    SVD_FUNC(gesdd, cdouble, double, z)
+template<typename T, typename Tr>
+using svd_func_def = int (*)(ORDER_TYPE,
+                             char jobz,
+                             int m, int n,
+                             T* in, int ldin,
+                             Tr* s,
+                             T* u, int ldu,
+                             T* vt, int ldvt);
+
+SVD_FUNC_DEF( gesdd )
+SVD_FUNC(gesdd, float  , float , s)
+SVD_FUNC(gesdd, double , double, d)
+SVD_FUNC(gesdd, cfloat , float , c)
+SVD_FUNC(gesdd, cdouble, double, z)
 
 #else   // Atlas causes memory freeing issues with using gesdd
 
-    template<typename T, typename Tr>
-    using svd_func_def = int (*)(ORDER_TYPE,
-                                 char jobu, char jobvt,
-                                 int m, int n,
-                                 T* in, int ldin,
-                                 Tr* s,
-                                 T* u, int ldu,
-                                 T* vt, int ldvt,
-                                 Tr *superb);
-
-    SVD_FUNC_DEF( gesvd )
-    SVD_FUNC(gesvd, float  , float , s)
-    SVD_FUNC(gesvd, double , double, d)
-    SVD_FUNC(gesvd, cfloat , float , c)
-    SVD_FUNC(gesvd, cdouble, double, z)
+template<typename T, typename Tr>
+using svd_func_def = int (*)(ORDER_TYPE,
+                             char jobu, char jobvt,
+                             int m, int n,
+                             T* in, int ldin,
+                             Tr* s,
+                             T* u, int ldu,
+                             T* vt, int ldvt,
+                             Tr *superb);
+
+SVD_FUNC_DEF( gesvd )
+SVD_FUNC(gesvd, float  , float , s)
+SVD_FUNC(gesvd, double , double, d)
+SVD_FUNC(gesvd, cfloat , float , c)
+SVD_FUNC(gesvd, cdouble, double, z)
 
 #endif
 
-    template <typename T, typename Tr>
-    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
-    {
-        s.eval();
-        u.eval();
-        vt.eval();
-        in.eval();
+template <typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+    s.eval();
+    u.eval();
+    vt.eval();
+    in.eval();
 
-        auto func = [=] (Array<Tr> s, Array<T> u, Array<T> vt, Array<T> in) {
-            dim4 iDims = in.dims();
-            int M = iDims[0];
-            int N = iDims[1];
+    auto func = [=] (Array<Tr> s, Array<T> u, Array<T> vt, Array<T> in) {
+        dim4 iDims = in.dims();
+        int M = iDims[0];
+        int N = iDims[1];
 
 #if defined(USE_MKL) || defined(__APPLE__)
-            svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1],
-                    s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]);
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1],
+                s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]);
 #else
-            std::vector<Tr> superb(std::min(M, N));
-            svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1],
-                    s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]);
+        std::vector<Tr> superb(std::min(M, N));
+        svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1],
+                s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]);
 #endif
-        };
-        getQueue().enqueue(func, s, u, vt, in);
-    }
-
-    template <typename T, typename Tr>
-    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
-    {
-        Array<T> in_copy = copyArray<T>(in);
-        svdInPlace(s, u, vt, in_copy);
-    }
+    };
+    getQueue().enqueue(func, s, u, vt, in);
+}
+
+template <typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+    Array<T> in_copy = copyArray<T>(in);
+    svdInPlace(s, u, vt, in_copy);
+}
+
 }
 
 #else
 
 namespace cpu
 {
-    template <typename T, typename Tr>
-    void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
-    {
-        AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
-    }
-
-    template <typename T, typename Tr>
-    void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
-    {
-        AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
-    }
+
+template <typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+    AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
+template <typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+    AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
 }
 
 #endif
 
-namespace cpu {
+namespace cpu
+{
 
 #define INSTANTIATE_SVD(T, Tr)                                          \
     template void svd<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, const Array<T> &in); \
     template void svdInPlace<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, Array<T> &in);
 
-    INSTANTIATE_SVD(float  , float )
-    INSTANTIATE_SVD(double , double)
-    INSTANTIATE_SVD(cfloat , float )
-    INSTANTIATE_SVD(cdouble, double)
+INSTANTIATE_SVD(float  , float )
+INSTANTIATE_SVD(double , double)
+INSTANTIATE_SVD(cfloat , float )
+INSTANTIATE_SVD(cdouble, double)
+
 }
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp
index f4a0514..a7287ce 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/transform.cpp
@@ -107,8 +107,10 @@ template<typename T>
 Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
                     const af_interp_type method, const bool inverse)
 {
-    Array<T> out = createEmptyArray<T>(odims);
     in.eval();
+    transform.eval();
+
+    Array<T> out = createEmptyArray<T>(odims);
 
     switch(method) {
         case AF_INTERP_NEAREST :
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index c3a8a37..7e7eec1 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -171,5 +171,4 @@ INSTANTIATE(uintl  )
 INSTANTIATE(short)
 INSTANTIATE(ushort)
 
-
 }
diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp
index ed7f348..13bee16 100644
--- a/src/backend/cpu/triangle.cpp
+++ b/src/backend/cpu/triangle.cpp
@@ -66,6 +66,7 @@ void triangle(Array<T> &out, const Array<T> &in)
 template<typename T, bool is_upper, bool is_unit_diag>
 Array<T> triangle(const Array<T> &in)
 {
+    in.eval();
     Array<T> out = createEmptyArray<T>(in.dims());
     triangle<T, is_upper, is_unit_diag>(out, in);
     return out;
@@ -81,17 +82,17 @@ Array<T> triangle(const Array<T> &in)
     template Array<T> triangle<T, true , false>(const Array<T> &in);    \
     template Array<T> triangle<T, false, false>(const Array<T> &in);    \
 
-    INSTANTIATE(float)
-    INSTANTIATE(double)
-    INSTANTIATE(cfloat)
-    INSTANTIATE(cdouble)
-    INSTANTIATE(int)
-    INSTANTIATE(uint)
-    INSTANTIATE(intl)
-    INSTANTIATE(uintl)
-    INSTANTIATE(char)
-    INSTANTIATE(uchar)
-    INSTANTIATE(short)
-    INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
 
 }
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index efb46be..41423c7 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -83,8 +83,9 @@ template<typename T>
 Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
 {
-    af::dim4 idims = in.dims();
+    in.eval();
 
+    af::dim4 idims = in.dims();
     dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
     dim_t ny = (idims[1] + 2 * py - wy) / sy + 1;
 
diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp
index e6a4817..441c7ff 100644
--- a/src/backend/cpu/where.cpp
+++ b/src/backend/cpu/where.cpp
@@ -23,61 +23,62 @@ using af::dim4;
 
 namespace cpu
 {
-    template<typename T>
-    Array<uint> where(const Array<T> &in)
-    {
-        evalArray(in);
-        getQueue().sync();
 
-        const dim_t *dims    = in.dims().get();
-        const dim_t *strides = in.strides().get();
-        static const T zero = scalar<T>(0);
+template<typename T>
+Array<uint> where(const Array<T> &in)
+{
+    evalArray(in);
+    getQueue().sync();
+
+    const dim_t *dims    = in.dims().get();
+    const dim_t *strides = in.strides().get();
+    static const T zero = scalar<T>(0);
 
-        const T *iptr = in.get();
-        uint *out_vec  = memAlloc<uint>(in.elements());
+    const T *iptr = in.get();
+    uint *out_vec  = memAlloc<uint>(in.elements());
 
-        dim_t count = 0;
-        dim_t idx = 0;
-        for (dim_t w = 0; w < dims[3]; w++) {
-            uint offw = w * strides[3];
+    dim_t count = 0;
+    dim_t idx = 0;
+    for (dim_t w = 0; w < dims[3]; w++) {
+        uint offw = w * strides[3];
 
-            for (dim_t z = 0; z < dims[2]; z++) {
-                uint offz = offw + z * strides[2];
+        for (dim_t z = 0; z < dims[2]; z++) {
+            uint offz = offw + z * strides[2];
 
-                for (dim_t y = 0; y < dims[1]; y++) {
-                    uint offy = y * strides[1] + offz;
+            for (dim_t y = 0; y < dims[1]; y++) {
+                uint offy = y * strides[1] + offz;
 
-                    for (dim_t x = 0; x < dims[0]; x++) {
+                for (dim_t x = 0; x < dims[0]; x++) {
 
-                        T val = iptr[offy + x];
-                        if (val != zero) {
-                            out_vec[count] = idx;
-                            count++;
-                        }
-                        idx++;
+                    T val = iptr[offy + x];
+                    if (val != zero) {
+                        out_vec[count] = idx;
+                        count++;
                     }
+                    idx++;
                 }
             }
         }
-
-        Array<uint> out = createDeviceDataArray<uint>(dim4(count), out_vec);
-        return out;
     }
 
+    Array<uint> out = createDeviceDataArray<uint>(dim4(count), out_vec);
+    return out;
+}
+
 #define INSTANTIATE(T)                                  \
     template Array<uint> where<T>(const Array<T> &in);    \
 
-    INSTANTIATE(float  )
-    INSTANTIATE(cfloat )
-    INSTANTIATE(double )
-    INSTANTIATE(cdouble)
-    INSTANTIATE(char   )
-    INSTANTIATE(int    )
-    INSTANTIATE(uint   )
-    INSTANTIATE(intl   )
-    INSTANTIATE(uintl  )
-    INSTANTIATE(uchar  )
-    INSTANTIATE(short  )
-    INSTANTIATE(ushort )
+INSTANTIATE(float  )
+INSTANTIATE(cfloat )
+INSTANTIATE(double )
+INSTANTIATE(cdouble)
+INSTANTIATE(char   )
+INSTANTIATE(int    )
+INSTANTIATE(uint   )
+INSTANTIATE(intl   )
+INSTANTIATE(uintl  )
+INSTANTIATE(uchar  )
+INSTANTIATE(short  )
+INSTANTIATE(ushort )
 
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git