[arrayfire] 97/284: Moved more cpu fns implementations to kernel namespace
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Feb 7 18:59:23 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.
commit bb0a22c6f6ddc1d3b2f07f1cf322722fa595c17c
Author: pradeep <pradeep at arrayfire.com>
Date: Sat Dec 19 14:06:37 2015 -0500
Moved more cpu fns implementations to kernel namespace
Below given is the list of functions that have undergone this change:
* iota
* ireduce
* join
* lu decomposition
* template matching
* mean shift
* median filter
* morphological operations
---
src/backend/cpu/iota.cpp | 38 +-------
src/backend/cpu/ireduce.cpp | 100 ++------------------
src/backend/cpu/join.cpp | 148 +++---------------------------
src/backend/cpu/kernel/iota.hpp | 45 +++++++++
src/backend/cpu/kernel/ireduce.hpp | 108 ++++++++++++++++++++++
src/backend/cpu/kernel/join.hpp | 144 +++++++++++++++++++++++++++++
src/backend/cpu/kernel/lu.hpp | 80 ++++++++++++++++
src/backend/cpu/kernel/match_template.hpp | 141 ++++++++++++++++++++++++++++
src/backend/cpu/kernel/meanshift.hpp | 138 ++++++++++++++++++++++++++++
src/backend/cpu/kernel/medfilt.hpp | 135 +++++++++++++++++++++++++++
src/backend/cpu/kernel/morph.hpp | 140 ++++++++++++++++++++++++++++
src/backend/cpu/lu.cpp | 66 +------------
src/backend/cpu/match_template.cpp | 127 +------------------------
src/backend/cpu/meanshift.cpp | 121 +-----------------------
src/backend/cpu/medfilt.cpp | 117 +----------------------
src/backend/cpu/morph.cpp | 126 +------------------------
src/backend/cpu/utility.hpp | 4 +-
17 files changed, 971 insertions(+), 807 deletions(-)
diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp
index dcb85fa..41f0c9c 100644
--- a/src/backend/cpu/iota.cpp
+++ b/src/backend/cpu/iota.cpp
@@ -10,49 +10,15 @@
#include <Array.hpp>
#include <iota.hpp>
#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <algorithm>
-#include <numeric>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/iota.hpp>
using namespace std;
namespace cpu
{
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T>
-void iota_(Array<T> output, const dim4 &sdims, const dim4 &tdims)
-{
- const dim4 dims = output.dims();
- T* out = output.get();
- const dim4 strides = output.strides();
-
- for(dim_t w = 0; w < dims[3]; w++) {
- dim_t offW = w * strides[3];
- T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2];
- for(dim_t z = 0; z < dims[2]; z++) {
- dim_t offWZ = offW + z * strides[2];
- T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1];
- for(dim_t y = 0; y < dims[1]; y++) {
- dim_t offWZY = offWZ + y * strides[1];
- T valY = valZ + (y % sdims[1]) * sdims[0];
- for(dim_t x = 0; x < dims[0]; x++) {
- dim_t id = offWZY + x;
- out[id] = valY + (x % sdims[0]);
- }
- }
- }
- }
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
template<typename T>
Array<T> iota(const dim4 &dims, const dim4 &tile_dims)
{
@@ -60,7 +26,7 @@ Array<T> iota(const dim4 &dims, const dim4 &tile_dims)
Array<T> out = createEmptyArray<T>(outdims);
- getQueue().enqueue(iota_<T>, out, dims, tile_dims);
+ getQueue().enqueue(kernel::iota<T>, out, dims, tile_dims);
return out;
}
diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp
index 9858cba..f1efcf6 100644
--- a/src/backend/cpu/ireduce.cpp
+++ b/src/backend/cpu/ireduce.cpp
@@ -13,103 +13,15 @@
#include <ArrayInfo.hpp>
#include <Array.hpp>
#include <ireduce.hpp>
-
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/ireduce.hpp>
using af::dim4;
namespace cpu
{
-template<typename T> double cabs(const T in) { return (double)in; }
-static double cabs(const char in) { return (double)(in > 0); }
-static double cabs(const cfloat &in) { return (double)abs(in); }
-static double cabs(const cdouble &in) { return (double)abs(in); }
-
-template<af_op_t op, typename T>
-struct MinMaxOp
-{
- T m_val;
- uint m_idx;
- MinMaxOp(T val, uint idx) :
- m_val(val), m_idx(idx)
- {
- }
-
- void operator()(T val, uint idx)
- {
- if (cabs(val) < cabs(m_val) ||
- (cabs(val) == cabs(m_val) &&
- idx > m_idx)) {
- m_val = val;
- m_idx = idx;
- }
- }
-};
-
-template<typename T>
-struct MinMaxOp<af_max_t, T>
-{
- T m_val;
- uint m_idx;
- MinMaxOp(T val, uint idx) :
- m_val(val), m_idx(idx)
- {
- }
-
- void operator()(T val, uint idx)
- {
- if (cabs(val) > cabs(m_val) ||
- (cabs(val) == cabs(m_val) &&
- idx <= m_idx)) {
- m_val = val;
- m_idx = idx;
- }
- }
-};
-
-template<af_op_t op, typename T, int D>
-struct ireduce_dim
-{
- void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
- const Array<T> input, const dim_t inOffset, const int dim)
- {
- const dim4 odims = output.dims();
- const dim4 ostrides = output.strides();
- const dim4 istrides = input.strides();
- const int D1 = D - 1;
- for (dim_t i = 0; i < odims[D1]; i++) {
- ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
- input, inOffset + i * istrides[D1], dim);
- }
- }
-};
-
-template<af_op_t op, typename T>
-struct ireduce_dim<op, T, 0>
-{
- void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
- const Array<T> input, const dim_t inOffset, const int dim)
- {
- const dim4 idims = input.dims();
- const dim4 istrides = input.strides();
-
- T const * const in = input.get();
- T * out = output.get();
- uint * loc = locArray.get();
-
- dim_t stride = istrides[dim];
- MinMaxOp<op, T> Op(in[0], 0);
- for (dim_t i = 0; i < idims[dim]; i++) {
- Op(in[inOffset + i * stride], i);
- }
-
- *(out+outOffset) = Op.m_val;
- *(loc+outOffset) = Op.m_idx;
- }
-};
-
template<af_op_t op, typename T>
using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
const Array<T>, const dim_t, const int)>;
@@ -123,10 +35,10 @@ void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
dim4 odims = in.dims();
odims[dim] = 1;
- static const ireduce_dim_func<op, T> ireduce_funcs[] = { ireduce_dim<op, T, 1>()
- , ireduce_dim<op, T, 2>()
- , ireduce_dim<op, T, 3>()
- , ireduce_dim<op, T, 4>()};
+ static const ireduce_dim_func<op, T> ireduce_funcs[] = { kernel::ireduce_dim<op, T, 1>()
+ , kernel::ireduce_dim<op, T, 2>()
+ , kernel::ireduce_dim<op, T, 3>()
+ , kernel::ireduce_dim<op, T, 4>()};
getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim);
}
@@ -141,7 +53,7 @@ T ireduce_all(unsigned *loc, const Array<T> &in)
af::dim4 strides = in.strides();
const T *inPtr = in.get();
- MinMaxOp<op, T> Op(inPtr[0], 0);
+ kernel::MinMaxOp<op, T> Op(inPtr[0], 0);
for(dim_t l = 0; l < dims[3]; l++) {
dim_t off3 = l * strides[3];
diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp
index 8af9c24..e39280c 100644
--- a/src/backend/cpu/join.cpp
+++ b/src/backend/cpu/join.cpp
@@ -9,50 +9,12 @@
#include <Array.hpp>
#include <join.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/join.hpp>
namespace cpu
{
-template<typename To, typename Tx, int dim>
-void join_append(To *out, const Tx *X, const af::dim4 &offset,
- const af::dim4 &odims, const af::dim4 &xdims,
- const af::dim4 &ost, const af::dim4 &xst)
-{
- for(dim_t ow = 0; ow < xdims[3]; ow++) {
- const dim_t xW = ow * xst[3];
- const dim_t oW = (ow + offset[3]) * ost[3];
-
- for(dim_t oz = 0; oz < xdims[2]; oz++) {
- const dim_t xZW = xW + oz * xst[2];
- const dim_t oZW = oW + (oz + offset[2]) * ost[2];
-
- for(dim_t oy = 0; oy < xdims[1]; oy++) {
- const dim_t xYZW = xZW + oy * xst[1];
- const dim_t oYZW = oZW + (oy + offset[1]) * ost[1];
-
- for(dim_t ox = 0; ox < xdims[0]; ox++) {
- const dim_t iMem = xYZW + ox;
- const dim_t oMem = oYZW + (ox + offset[0]);
- out[oMem] = X[iMem];
- }
- }
- }
- }
-}
-
-template<int dim>
-af::dim4 calcOffset(const af::dim4 dims)
-{
- af::dim4 offset;
- offset[0] = (dim == 0) ? dims[0] : 0;
- offset[1] = (dim == 1) ? dims[1] : 0;
- offset[2] = (dim == 2) ? dims[2] : 0;
- offset[3] = (dim == 3) ? dims[3] : 0;
- return offset;
-}
template<typename Tx, typename Ty>
Array<Tx> join(const int dim, const Array<Tx> &first, const Array<Ty> &second)
@@ -76,97 +38,15 @@ Array<Tx> join(const int dim, const Array<Tx> &first, const Array<Ty> &second)
Array<Tx> out = createEmptyArray<Tx>(odims);
- auto func = [=] (Array<Tx> out, const Array<Tx> first, const Array<Ty> second) {
- Tx* outPtr = out.get();
- const Tx* fptr = first.get();
- const Ty* sptr = second.get();
-
- af::dim4 zero(0,0,0,0);
- const af::dim4 odims = out.dims();
- const af::dim4 fdims = first.dims();
- const af::dim4 sdims = second.dims();
-
- switch(dim) {
- case 0:
- join_append<Tx, Tx, 0>(outPtr, fptr, zero,
- odims, fdims, out.strides(), first.strides());
- join_append<Tx, Ty, 0>(outPtr, sptr, calcOffset<0>(fdims),
- odims, sdims, out.strides(), second.strides());
- break;
- case 1:
- join_append<Tx, Tx, 1>(outPtr, fptr, zero,
- odims, fdims, out.strides(), first.strides());
- join_append<Tx, Ty, 1>(outPtr, sptr, calcOffset<1>(fdims),
- odims, sdims, out.strides(), second.strides());
- break;
- case 2:
- join_append<Tx, Tx, 2>(outPtr, fptr, zero,
- odims, fdims, out.strides(), first.strides());
- join_append<Tx, Ty, 2>(outPtr, sptr, calcOffset<2>(fdims),
- odims, sdims, out.strides(), second.strides());
- break;
- case 3:
- join_append<Tx, Tx, 3>(outPtr, fptr, zero,
- odims, fdims, out.strides(), first.strides());
- join_append<Tx, Ty, 3>(outPtr, sptr, calcOffset<3>(fdims),
- odims, sdims, out.strides(), second.strides());
- break;
- }
- };
- getQueue().enqueue(func, out, first, second);
+ getQueue().enqueue(kernel::join<Tx, Ty>, out, dim, first, second);
return out;
}
-template<typename T, int n_arrays>
-void join_wrapper(const int dim, Array<T> out, const std::vector<Array<T>> inputs)
-{
- af::dim4 zero(0,0,0,0);
- af::dim4 d = zero;
- switch(dim) {
- case 0:
- join_append<T, T, 0>(out.get(), inputs[0].get(), zero,
- out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
- for(int i = 1; i < n_arrays; i++) {
- d += inputs[i - 1].dims();
- join_append<T, T, 0>(out.get(), inputs[i].get(), calcOffset<0>(d),
- out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
- }
- break;
- case 1:
- join_append<T, T, 1>(out.get(), inputs[0].get(), zero,
- out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
- for(int i = 1; i < n_arrays; i++) {
- d += inputs[i - 1].dims();
- join_append<T, T, 1>(out.get(), inputs[i].get(), calcOffset<1>(d),
- out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
- }
- break;
- case 2:
- join_append<T, T, 2>(out.get(), inputs[0].get(), zero,
- out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
- for(int i = 1; i < n_arrays; i++) {
- d += inputs[i - 1].dims();
- join_append<T, T, 2>(out.get(), inputs[i].get(), calcOffset<2>(d),
- out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
- }
- break;
- case 3:
- join_append<T, T, 3>(out.get(), inputs[0].get(), zero,
- out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
- for(int i = 1; i < n_arrays; i++) {
- d += inputs[i - 1].dims();
- join_append<T, T, 3>(out.get(), inputs[i].get(), calcOffset<3>(d),
- out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
- }
- break;
- }
-}
-
template<typename T>
Array<T> join(const int dim, const std::vector<Array<T>> &inputs)
{
- for (int i=0; i<inputs.size(); ++i)
+ for (unsigned i=0; i<inputs.size(); ++i)
inputs[i].eval();
// All dimensions except join dimension must be equal
// Compute output dims
@@ -175,7 +55,7 @@ Array<T> join(const int dim, const std::vector<Array<T>> &inputs)
std::vector<af::dim4> idims(n_arrays);
dim_t dim_size = 0;
- for(int i = 0; i < (int)idims.size(); i++) {
+ for(unsigned i = 0; i < idims.size(); i++) {
idims[i] = inputs[i].dims();
dim_size += idims[i][dim];
}
@@ -192,34 +72,34 @@ Array<T> join(const int dim, const std::vector<Array<T>> &inputs)
switch(n_arrays) {
case 1:
- getQueue().enqueue(join_wrapper<T, 1>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 1>, dim, out, inputs);
break;
case 2:
- getQueue().enqueue(join_wrapper<T, 2>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 2>, dim, out, inputs);
break;
case 3:
- getQueue().enqueue(join_wrapper<T, 3>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 3>, dim, out, inputs);
break;
case 4:
- getQueue().enqueue(join_wrapper<T, 4>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 4>, dim, out, inputs);
break;
case 5:
- getQueue().enqueue(join_wrapper<T, 5>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 5>, dim, out, inputs);
break;
case 6:
- getQueue().enqueue(join_wrapper<T, 6>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 6>, dim, out, inputs);
break;
case 7:
- getQueue().enqueue(join_wrapper<T, 7>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 7>, dim, out, inputs);
break;
case 8:
- getQueue().enqueue(join_wrapper<T, 8>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 8>, dim, out, inputs);
break;
case 9:
- getQueue().enqueue(join_wrapper<T, 9>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T, 9>, dim, out, inputs);
break;
case 10:
- getQueue().enqueue(join_wrapper<T,10>, dim, out, inputs);
+ getQueue().enqueue(kernel::join<T,10>, dim, out, inputs);
break;
}
diff --git a/src/backend/cpu/kernel/iota.hpp b/src/backend/cpu/kernel/iota.hpp
new file mode 100644
index 0000000..0f82429
--- /dev/null
+++ b/src/backend/cpu/kernel/iota.hpp
@@ -0,0 +1,45 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void iota(Array<T> output, const af::dim4 &sdims, const af::dim4 &tdims)
+{
+ const af::dim4 dims = output.dims();
+ T* out = output.get();
+ const af::dim4 strides = output.strides();
+
+ for(dim_t w = 0; w < dims[3]; w++) {
+ dim_t offW = w * strides[3];
+ T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2];
+ for(dim_t z = 0; z < dims[2]; z++) {
+ dim_t offWZ = offW + z * strides[2];
+ T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1];
+ for(dim_t y = 0; y < dims[1]; y++) {
+ dim_t offWZY = offWZ + y * strides[1];
+ T valY = valZ + (y % sdims[1]) * sdims[0];
+ for(dim_t x = 0; x < dims[0]; x++) {
+ dim_t id = offWZY + x;
+ out[id] = valY + (x % sdims[0]);
+ }
+ }
+ }
+ }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/ireduce.hpp b/src/backend/cpu/kernel/ireduce.hpp
new file mode 100644
index 0000000..1f5a51d
--- /dev/null
+++ b/src/backend/cpu/kernel/ireduce.hpp
@@ -0,0 +1,108 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T> double cabs(const T in) { return (double)in; }
+static double cabs(const char in) { return (double)(in > 0); }
+static double cabs(const cfloat &in) { return (double)abs(in); }
+static double cabs(const cdouble &in) { return (double)abs(in); }
+
+template<af_op_t op, typename T>
+struct MinMaxOp
+{
+ T m_val;
+ uint m_idx;
+ MinMaxOp(T val, uint idx) :
+ m_val(val), m_idx(idx)
+ {
+ }
+
+ void operator()(T val, uint idx)
+ {
+ if (cabs(val) < cabs(m_val) ||
+ (cabs(val) == cabs(m_val) &&
+ idx > m_idx)) {
+ m_val = val;
+ m_idx = idx;
+ }
+ }
+};
+
+template<typename T>
+struct MinMaxOp<af_max_t, T>
+{
+ T m_val;
+ uint m_idx;
+ MinMaxOp(T val, uint idx) :
+ m_val(val), m_idx(idx)
+ {
+ }
+
+ void operator()(T val, uint idx)
+ {
+ if (cabs(val) > cabs(m_val) ||
+ (cabs(val) == cabs(m_val) &&
+ idx <= m_idx)) {
+ m_val = val;
+ m_idx = idx;
+ }
+ }
+};
+
+template<af_op_t op, typename T, int D>
+struct ireduce_dim
+{
+ void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+ const Array<T> input, const dim_t inOffset, const int dim)
+ {
+ const af::dim4 odims = output.dims();
+ const af::dim4 ostrides = output.strides();
+ const af::dim4 istrides = input.strides();
+ const int D1 = D - 1;
+ for (dim_t i = 0; i < odims[D1]; i++) {
+ ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
+ input, inOffset + i * istrides[D1], dim);
+ }
+ }
+};
+
+template<af_op_t op, typename T>
+struct ireduce_dim<op, T, 0>
+{
+ void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+ const Array<T> input, const dim_t inOffset, const int dim)
+ {
+ const af::dim4 idims = input.dims();
+ const af::dim4 istrides = input.strides();
+
+ T const * const in = input.get();
+ T * out = output.get();
+ uint * loc = locArray.get();
+
+ dim_t stride = istrides[dim];
+ MinMaxOp<op, T> Op(in[0], 0);
+ for (dim_t i = 0; i < idims[dim]; i++) {
+ Op(in[inOffset + i * stride], i);
+ }
+
+ *(out+outOffset) = Op.m_val;
+ *(loc+outOffset) = Op.m_idx;
+ }
+};
+
+}
+}
diff --git a/src/backend/cpu/kernel/join.hpp b/src/backend/cpu/kernel/join.hpp
new file mode 100644
index 0000000..b0d92c9
--- /dev/null
+++ b/src/backend/cpu/kernel/join.hpp
@@ -0,0 +1,144 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<int dim>
+af::dim4 calcOffset(const af::dim4 dims)
+{
+ af::dim4 offset;
+ offset[0] = (dim == 0) ? dims[0] : 0;
+ offset[1] = (dim == 1) ? dims[1] : 0;
+ offset[2] = (dim == 2) ? dims[2] : 0;
+ offset[3] = (dim == 3) ? dims[3] : 0;
+ return offset;
+}
+
+template<typename To, typename Tx, int dim>
+void join_append(To *out, const Tx *X, const af::dim4 &offset,
+ const af::dim4 &odims, const af::dim4 &xdims,
+ const af::dim4 &ost, const af::dim4 &xst)
+{
+ for(dim_t ow = 0; ow < xdims[3]; ow++) {
+ const dim_t xW = ow * xst[3];
+ const dim_t oW = (ow + offset[3]) * ost[3];
+
+ for(dim_t oz = 0; oz < xdims[2]; oz++) {
+ const dim_t xZW = xW + oz * xst[2];
+ const dim_t oZW = oW + (oz + offset[2]) * ost[2];
+
+ for(dim_t oy = 0; oy < xdims[1]; oy++) {
+ const dim_t xYZW = xZW + oy * xst[1];
+ const dim_t oYZW = oZW + (oy + offset[1]) * ost[1];
+
+ for(dim_t ox = 0; ox < xdims[0]; ox++) {
+ const dim_t iMem = xYZW + ox;
+ const dim_t oMem = oYZW + (ox + offset[0]);
+ out[oMem] = X[iMem];
+ }
+ }
+ }
+ }
+}
+
+template<typename Tx, typename Ty>
+void join(Array<Tx> out, const int dim, const Array<Tx> first, const Array<Ty> second)
+{
+ Tx* outPtr = out.get();
+ const Tx* fptr = first.get();
+ const Ty* sptr = second.get();
+
+ af::dim4 zero(0,0,0,0);
+ const af::dim4 odims = out.dims();
+ const af::dim4 fdims = first.dims();
+ const af::dim4 sdims = second.dims();
+
+ switch(dim) {
+ case 0:
+ join_append<Tx, Tx, 0>(outPtr, fptr, zero,
+ odims, fdims, out.strides(), first.strides());
+ join_append<Tx, Ty, 0>(outPtr, sptr, calcOffset<0>(fdims),
+ odims, sdims, out.strides(), second.strides());
+ break;
+ case 1:
+ join_append<Tx, Tx, 1>(outPtr, fptr, zero,
+ odims, fdims, out.strides(), first.strides());
+ join_append<Tx, Ty, 1>(outPtr, sptr, calcOffset<1>(fdims),
+ odims, sdims, out.strides(), second.strides());
+ break;
+ case 2:
+ join_append<Tx, Tx, 2>(outPtr, fptr, zero,
+ odims, fdims, out.strides(), first.strides());
+ join_append<Tx, Ty, 2>(outPtr, sptr, calcOffset<2>(fdims),
+ odims, sdims, out.strides(), second.strides());
+ break;
+ case 3:
+ join_append<Tx, Tx, 3>(outPtr, fptr, zero,
+ odims, fdims, out.strides(), first.strides());
+ join_append<Tx, Ty, 3>(outPtr, sptr, calcOffset<3>(fdims),
+ odims, sdims, out.strides(), second.strides());
+ break;
+ }
+}
+
+template<typename T, int n_arrays>
+void join(const int dim, Array<T> out, const std::vector<Array<T>> inputs)
+{
+ af::dim4 zero(0,0,0,0);
+ af::dim4 d = zero;
+ switch(dim) {
+ case 0:
+ join_append<T, T, 0>(out.get(), inputs[0].get(), zero,
+ out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+ for(int i = 1; i < n_arrays; i++) {
+ d += inputs[i - 1].dims();
+ join_append<T, T, 0>(out.get(), inputs[i].get(), calcOffset<0>(d),
+ out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+ }
+ break;
+ case 1:
+ join_append<T, T, 1>(out.get(), inputs[0].get(), zero,
+ out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+ for(int i = 1; i < n_arrays; i++) {
+ d += inputs[i - 1].dims();
+ join_append<T, T, 1>(out.get(), inputs[i].get(), calcOffset<1>(d),
+ out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+ }
+ break;
+ case 2:
+ join_append<T, T, 2>(out.get(), inputs[0].get(), zero,
+ out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+ for(int i = 1; i < n_arrays; i++) {
+ d += inputs[i - 1].dims();
+ join_append<T, T, 2>(out.get(), inputs[i].get(), calcOffset<2>(d),
+ out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+ }
+ break;
+ case 3:
+ join_append<T, T, 3>(out.get(), inputs[0].get(), zero,
+ out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+ for(int i = 1; i < n_arrays; i++) {
+ d += inputs[i - 1].dims();
+ join_append<T, T, 3>(out.get(), inputs[i].get(), calcOffset<3>(d),
+ out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+ }
+ break;
+ }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/lu.hpp b/src/backend/cpu/kernel/lu.hpp
new file mode 100644
index 0000000..35b0c19
--- /dev/null
+++ b/src/backend/cpu/kernel/lu.hpp
@@ -0,0 +1,80 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void lu_split(Array<T> lower, Array<T> upper, const Array<T> in)
+{
+ T *l = lower.get();
+ T *u = upper.get();
+ const T *i = in.get();
+
+ af::dim4 ldm = lower.dims();
+ af::dim4 udm = upper.dims();
+ af::dim4 idm = in.dims();
+ af::dim4 lst = lower.strides();
+ af::dim4 ust = upper.strides();
+ af::dim4 ist = in.strides();
+
+ for(dim_t ow = 0; ow < idm[3]; ow++) {
+ const dim_t lW = ow * lst[3];
+ const dim_t uW = ow * ust[3];
+ const dim_t iW = ow * ist[3];
+
+ for(dim_t oz = 0; oz < idm[2]; oz++) {
+ const dim_t lZW = lW + oz * lst[2];
+ const dim_t uZW = uW + oz * ust[2];
+ const dim_t iZW = iW + oz * ist[2];
+
+ for(dim_t oy = 0; oy < idm[1]; oy++) {
+ const dim_t lYZW = lZW + oy * lst[1];
+ const dim_t uYZW = uZW + oy * ust[1];
+ const dim_t iYZW = iZW + oy * ist[1];
+
+ for(dim_t ox = 0; ox < idm[0]; ox++) {
+ const dim_t lMem = lYZW + ox;
+ const dim_t uMem = uYZW + ox;
+ const dim_t iMem = iYZW + ox;
+ if(ox > oy) {
+ if(oy < ldm[1]) l[lMem] = i[iMem];
+ if(ox < udm[0]) u[uMem] = scalar<T>(0);
+ } else if (oy > ox) {
+ if(oy < ldm[1]) l[lMem] = scalar<T>(0);
+ if(ox < udm[0]) u[uMem] = i[iMem];
+ } else if(ox == oy) {
+ if(oy < ldm[1]) l[lMem] = scalar<T>(1.0);
+ if(ox < udm[0]) u[uMem] = i[iMem];
+ }
+ }
+ }
+ }
+ }
+}
+
+void convertPivot(Array<int> p, Array<int> pivot)
+{
+ int *d_pi = pivot.get();
+ int *d_po = p.get();
+ dim_t d0 = pivot.dims()[0];
+ for(int j = 0; j < (int)d0; j++) {
+ // 1 indexed in pivot
+ std::swap(d_po[j], d_po[d_pi[j] - 1]);
+ }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/match_template.hpp b/src/backend/cpu/kernel/match_template.hpp
new file mode 100644
index 0000000..ae41364
--- /dev/null
+++ b/src/backend/cpu/kernel/match_template.hpp
@@ -0,0 +1,141 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename OutT, typename InT, af_match_type MatchT>
+void matchTemplate(Array<OutT> out, const Array<InT> sImg, const Array<InT> tImg)
+{
+ const af::dim4 sDims = sImg.dims();
+ const af::dim4 tDims = tImg.dims();
+ const af::dim4 sStrides = sImg.strides();
+ const af::dim4 tStrides = tImg.strides();
+
+ const dim_t tDim0 = tDims[0];
+ const dim_t tDim1 = tDims[1];
+ const dim_t sDim0 = sDims[0];
+ const dim_t sDim1 = sDims[1];
+
+ const af::dim4 oStrides = out.strides();
+
+ OutT tImgMean = OutT(0);
+ dim_t winNumElements = tImg.elements();
+ bool needMean = MatchT==AF_ZSAD || MatchT==AF_LSAD ||
+ MatchT==AF_ZSSD || MatchT==AF_LSSD ||
+ MatchT==AF_ZNCC;
+ const InT * tpl = tImg.get();
+
+ if (needMean) {
+ for(dim_t tj=0; tj<tDim1; tj++) {
+ dim_t tjStride = tj*tStrides[1];
+
+ for(dim_t ti=0; ti<tDim0; ti++) {
+ tImgMean += (OutT)tpl[tjStride+ti*tStrides[0]];
+ }
+ }
+ tImgMean /= winNumElements;
+ }
+
+ OutT * dst = out.get();
+ const InT * src = sImg.get();
+
+ for(dim_t b3=0; b3<sDims[3]; ++b3) {
+ for(dim_t b2=0; b2<sDims[2]; ++b2) {
+
+ // slide through image window after window
+ for(dim_t sj=0; sj<sDim1; sj++) {
+
+ dim_t ojStride = sj*oStrides[1];
+
+ for(dim_t si=0; si<sDim0; si++) {
+ OutT disparity = OutT(0);
+
+ // mean for window
+ // this variable will be used based on MatchT value
+ OutT wImgMean = OutT(0);
+ if (needMean) {
+ for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
+ dim_t jStride = j*sStrides[1];
+
+ for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
+ InT sVal = ((j<sDim1 && i<sDim0) ?
+ src[jStride + i*sStrides[0]] : InT(0));
+ wImgMean += (OutT)sVal;
+ }
+ }
+ wImgMean /= winNumElements;
+ }
+
+ // run the window match metric
+ for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
+ dim_t jStride = j*sStrides[1];
+ dim_t tjStride = tj*tStrides[1];
+
+ for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
+ InT sVal = ((j<sDim1 && i<sDim0) ?
+ src[jStride + i*sStrides[0]] : InT(0));
+ InT tVal = tpl[tjStride+ti*tStrides[0]];
+ OutT temp;
+ switch(MatchT) {
+ case AF_SAD:
+ disparity += fabs((OutT)sVal-(OutT)tVal);
+ break;
+ case AF_ZSAD:
+ disparity += fabs((OutT)sVal - wImgMean -
+ (OutT)tVal + tImgMean);
+ break;
+ case AF_LSAD:
+ disparity += fabs((OutT)sVal-(wImgMean/tImgMean)*tVal);
+ break;
+ case AF_SSD:
+ disparity += ((OutT)sVal-(OutT)tVal)*((OutT)sVal-(OutT)tVal);
+ break;
+ case AF_ZSSD:
+ temp = ((OutT)sVal - wImgMean - (OutT)tVal + tImgMean);
+ disparity += temp*temp;
+ break;
+ case AF_LSSD:
+ temp = ((OutT)sVal-(wImgMean/tImgMean)*tVal);
+ disparity += temp*temp;
+ break;
+ case AF_NCC:
+ //TODO: furture implementation
+ break;
+ case AF_ZNCC:
+ //TODO: furture implementation
+ break;
+ case AF_SHD:
+ //TODO: furture implementation
+ break;
+ }
+ }
+ }
+ // output is just created, hence not doing the
+ // extra multiplication for 0th dim stride
+ dst[ojStride + si] = disparity;
+ }
+ }
+ src += sStrides[2];
+ dst += oStrides[2];
+ }
+ src += sStrides[3];
+ dst += oStrides[3];
+ }
+};
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/meanshift.hpp b/src/backend/cpu/kernel/meanshift.hpp
new file mode 100644
index 0000000..f173b2f
--- /dev/null
+++ b/src/backend/cpu/kernel/meanshift.hpp
@@ -0,0 +1,138 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <vector>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, bool IsColor>
+void meanShift(Array<T> out, const Array<T> in, const float s_sigma,
+ const float c_sigma, const unsigned iter)
+{
+ const af::dim4 dims = in.dims();
+ const af::dim4 istrides = in.strides();
+ const af::dim4 ostrides = out.strides();
+
+ const dim_t bCount = (IsColor ? 1 : dims[2]);
+ const dim_t channels = (IsColor ? dims[2] : 1);
+
+ // clamp spatical and chromatic sigma's
+ float space_ = std::min(11.5f, s_sigma);
+ const dim_t radius = std::max((int)(space_ * 1.5f), 1);
+ const float cvar = c_sigma*c_sigma;
+
+ std::vector<float> means;
+ std::vector<float> centers;
+ std::vector<float> tmpclrs;
+ means.reserve(channels);
+ centers.reserve(channels);
+ tmpclrs.reserve(channels);
+
+ T *outData = out.get();
+ const T * inData = in.get();
+
+ for(dim_t b3=0; b3<dims[3]; ++b3) {
+ for(dim_t b2=0; b2<bCount; ++b2) {
+
+ for(dim_t j=0; j<dims[1]; ++j) {
+
+ dim_t j_in_off = j*istrides[1];
+ dim_t j_out_off = j*ostrides[1];
+
+ for(dim_t i=0; i<dims[0]; ++i) {
+
+ dim_t i_in_off = i*istrides[0];
+ dim_t i_out_off = i*ostrides[0];
+
+ // clear means and centers for this pixel
+ for(dim_t ch=0; ch<channels; ++ch) {
+ means[ch] = 0.0f;
+ // the expression ch*istrides[2] will only effect when ch>1
+ // i.e for color images where batch is along fourth dimension
+ centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]];
+ }
+
+ // scope of meanshift iterationd begin
+ for(unsigned it=0; it<iter; ++it) {
+
+ int count = 0;
+ int shift_x = 0;
+ int shift_y = 0;
+
+ for(dim_t wj=-radius; wj<=radius; ++wj) {
+
+ int hit_count = 0;
+
+ for(dim_t wi=-radius; wi<=radius; ++wi) {
+
+ dim_t tj = j + wj;
+ dim_t ti = i + wi;
+
+ // clamps offsets
+ tj = clamp(tj, 0ll, dims[1]-1);
+ ti = clamp(ti, 0ll, dims[0]-1);
+
+ // proceed
+ float norm = 0.0f;
+ for(dim_t ch=0; ch<channels; ++ch) {
+ tmpclrs[ch] = inData[ tj*istrides[1] + ti*istrides[0] + ch*istrides[2]];
+ norm += (centers[ch]-tmpclrs[ch]) * (centers[ch]-tmpclrs[ch]);
+ }
+
+ if (norm<= cvar) {
+ for(dim_t ch=0; ch<channels; ++ch)
+ means[ch] += tmpclrs[ch];
+ shift_x += wi;
+ ++hit_count;
+ }
+
+ }
+ count+= hit_count;
+ shift_y += wj*hit_count;
+ }
+
+ if (count==0) { break; }
+
+ const float fcount = 1.f/count;
+ const int mean_x = (int)(shift_x*fcount+0.5f);
+ const int mean_y = (int)(shift_y*fcount+0.5f);
+ for(dim_t ch=0; ch<channels; ++ch)
+ means[ch] *= fcount;
+
+ float norm = 0.f;
+ for(dim_t ch=0; ch<channels; ++ch)
+ norm += ((means[ch]-centers[ch])*(means[ch]-centers[ch]));
+ bool stop = ((abs(shift_y-mean_y)+abs(shift_x-mean_x)) + norm) <= 1;
+ shift_x = mean_x;
+ shift_y = mean_y;
+ for(dim_t ch=0; ch<channels; ++ch)
+ centers[ch] = means[ch];
+ if (stop) { break; }
+ } // scope of meanshift iterations end
+
+ for(dim_t ch=0; ch<channels; ++ch)
+ outData[j_out_off + i_out_off + ch*ostrides[2]] = centers[ch];
+
+ }
+ }
+ outData += ostrides[2];
+ inData += istrides[2];
+ }
+ }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/medfilt.hpp b/src/backend/cpu/kernel/medfilt.hpp
new file mode 100644
index 0000000..bc639a8
--- /dev/null
+++ b/src/backend/cpu/kernel/medfilt.hpp
@@ -0,0 +1,135 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <vector>
+#include <algorithm>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, af_border_type Pad>
+void medfilt(Array<T> out, const Array<T> in, dim_t w_len, dim_t w_wid)
+{
+ const af::dim4 dims = in.dims();
+ const af::dim4 istrides = in.strides();
+ const af::dim4 ostrides = out.strides();
+
+ std::vector<T> wind_vals;
+ wind_vals.reserve(w_len*w_wid);
+
+ T const * in_ptr = in.get();
+ T * out_ptr = out.get();
+
+ for(int b3=0; b3<(int)dims[3]; b3++) {
+
+ for(int b2=0; b2<(int)dims[2]; b2++) {
+
+ for(int col=0; col<(int)dims[1]; col++) {
+
+ int ocol_off = col*ostrides[1];
+
+ for(int row=0; row<(int)dims[0]; row++) {
+
+ wind_vals.clear();
+
+ for(int wj=0; wj<(int)w_wid; ++wj) {
+
+ bool isColOff = false;
+
+ int im_col = col + wj-w_wid/2;
+ int im_coff;
+ switch(Pad) {
+ case AF_PAD_ZERO:
+ im_coff = im_col * istrides[1];
+ if (im_col < 0 || im_col>=(int)dims[1])
+ isColOff = true;
+ break;
+ case AF_PAD_SYM:
+ {
+ if (im_col < 0) {
+ im_col *= -1;
+ isColOff = true;
+ }
+
+ if (im_col>=(int)dims[1]) {
+ im_col = 2*((int)dims[1]-1) - im_col;
+ isColOff = true;
+ }
+
+ im_coff = im_col * istrides[1];
+ }
+ break;
+ }
+
+ for(int wi=0; wi<(int)w_len; ++wi) {
+
+ bool isRowOff = false;
+
+ int im_row = row + wi-w_len/2;
+ int im_roff;
+ switch(Pad) {
+ case AF_PAD_ZERO:
+ im_roff = im_row * istrides[0];
+ if (im_row < 0 || im_row>=(int)dims[0])
+ isRowOff = true;
+ break;
+ case AF_PAD_SYM:
+ {
+ if (im_row < 0) {
+ im_row *= -1;
+ isRowOff = true;
+ }
+
+ if (im_row>=(int)dims[0]) {
+ im_row = 2*((int)dims[0]-1) - im_row;
+ isRowOff = true;
+ }
+
+ im_roff = im_row * istrides[0];
+ }
+ break;
+ }
+
+ if(isRowOff || isColOff) {
+ switch(Pad) {
+ case AF_PAD_ZERO:
+ wind_vals.push_back(0);
+ break;
+ case AF_PAD_SYM:
+ wind_vals.push_back(in_ptr[im_coff+im_roff]);
+ break;
+ }
+ } else
+ wind_vals.push_back(in_ptr[im_coff+im_roff]);
+ }
+ }
+
+ std::stable_sort(wind_vals.begin(),wind_vals.end());
+ int off = wind_vals.size()/2;
+ if (wind_vals.size()%2==0)
+ out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2;
+ else {
+ out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off];
+ }
+ }
+ }
+ in_ptr += istrides[2];
+ out_ptr += ostrides[2];
+ }
+ }
+}
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/morph.hpp b/src/backend/cpu/kernel/morph.hpp
new file mode 100644
index 0000000..af9b7e9
--- /dev/null
+++ b/src/backend/cpu/kernel/morph.hpp
@@ -0,0 +1,140 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <utility.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, bool IsDilation>
+void morph(Array<T> out, Array<T> const in, Array<T> const mask)
+{
+ const af::dim4 ostrides = out.strides();
+ const af::dim4 istrides = in.strides();
+ const af::dim4 fstrides = mask.strides();
+ const af::dim4 dims = in.dims();
+ const af::dim4 window = mask.dims();
+ T* outData = out.get();
+ const T* inData = in.get();
+ const T* filter = mask.get();
+ const dim_t R0 = window[0]/2;
+ const dim_t R1 = window[1]/2;
+
+ for(dim_t b3=0; b3<dims[3]; ++b3) {
+ for(dim_t b2=0; b2<dims[2]; ++b2) {
+ // either channels or batch is handled by outer most loop
+ for(dim_t j=0; j<dims[1]; ++j) {
+ // j steps along 2nd dimension
+ for(dim_t i=0; i<dims[0]; ++i) {
+ // i steps along 1st dimension
+ T filterResult = inData[ getIdx(istrides, i, j) ];
+
+ // wj,wi steps along 2nd & 1st dimensions of filter window respectively
+ for(dim_t wj=0; wj<window[1]; wj++) {
+ for(dim_t wi=0; wi<window[0]; wi++) {
+
+ dim_t offj = j+wj-R1;
+ dim_t offi = i+wi-R0;
+
+ T maskValue = filter[ getIdx(fstrides, wi, wj) ];
+
+ if ((maskValue > (T)0) && offi>=0 && offj>=0 && offi<dims[0] && offj<dims[1]) {
+
+ T inValue = inData[ getIdx(istrides, offi, offj) ];
+
+ if (IsDilation)
+ filterResult = std::max(filterResult, inValue);
+ else
+ filterResult = std::min(filterResult, inValue);
+ }
+
+ } // window 1st dimension loop ends here
+ } // filter window loop ends here
+
+ outData[ getIdx(ostrides, i, j) ] = filterResult;
+ } //1st dimension loop ends here
+ } // 2nd dimension loop ends here
+
+ // next iteration will be next batch if any
+ outData += ostrides[2];
+ inData += istrides[2];
+ }
+ }
+}
+
+template<typename T, bool IsDilation>
+void morph3d(Array<T> out, Array<T> const in, Array<T> const mask)
+{
+ const af::dim4 dims = in.dims();
+ const af::dim4 window = mask.dims();
+ const dim_t R0 = window[0]/2;
+ const dim_t R1 = window[1]/2;
+ const dim_t R2 = window[2]/2;
+ const af::dim4 istrides = in.strides();
+ const af::dim4 fstrides = mask.strides();
+ const dim_t bCount = dims[3];
+ const af::dim4 ostrides = out.strides();
+ T* outData = out.get();
+ const T* inData = in.get();
+ const T* filter = mask.get();
+
+ for(dim_t batchId=0; batchId<bCount; ++batchId) {
+ // either channels or batch is handled by outer most loop
+ for(dim_t k=0; k<dims[2]; ++k) {
+ // k steps along 3rd dimension
+ for(dim_t j=0; j<dims[1]; ++j) {
+ // j steps along 2nd dimension
+ for(dim_t i=0; i<dims[0]; ++i) {
+ // i steps along 1st dimension
+ T filterResult = inData[ getIdx(istrides, i, j, k) ];
+
+ // wk, wj,wi steps along 2nd & 1st dimensions of filter window respectively
+ for(dim_t wk=0; wk<window[2]; wk++) {
+ for(dim_t wj=0; wj<window[1]; wj++) {
+ for(dim_t wi=0; wi<window[0]; wi++) {
+
+ dim_t offk = k+wk-R2;
+ dim_t offj = j+wj-R1;
+ dim_t offi = i+wi-R0;
+
+ T maskValue = filter[ getIdx(fstrides, wi, wj, wk) ];
+
+ if ((maskValue > (T)0) && offi>=0 && offj>=0 && offk>=0 &&
+ offi<dims[0] && offj<dims[1] && offk<dims[2]) {
+
+ T inValue = inData[ getIdx(istrides, offi, offj, offk) ];
+
+ if (IsDilation)
+ filterResult = std::max(filterResult, inValue);
+ else
+ filterResult = std::min(filterResult, inValue);
+ }
+
+ } // window 1st dimension loop ends here
+ } // window 1st dimension loop ends here
+ }// filter window loop ends here
+
+ outData[ getIdx(ostrides, i, j, k) ] = filterResult;
+ } //1st dimension loop ends here
+ } // 2nd dimension loop ends here
+ } // 3rd dimension loop ends here
+ // next iteration will be next batch if any
+ outData += ostrides[3];
+ inData += istrides[3];
+ }
+}
+
+
+}
+}
diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp
index 9a04613..f0e1593 100644
--- a/src/backend/cpu/lu.cpp
+++ b/src/backend/cpu/lu.cpp
@@ -15,11 +15,11 @@
#include <handle.hpp>
#include <iostream>
#include <cassert>
-#include <err_cpu.hpp>
#include <range.hpp>
#include <lapack_helper.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/lu.hpp>
namespace cpu
{
@@ -42,66 +42,6 @@ LU_FUNC(getrf , cfloat , c)
LU_FUNC(getrf , cdouble, z)
template<typename T>
-void lu_split(Array<T> lower, Array<T> upper, const Array<T> in)
-{
- T *l = lower.get();
- T *u = upper.get();
- const T *i = in.get();
-
- dim4 ldm = lower.dims();
- dim4 udm = upper.dims();
- dim4 idm = in.dims();
- dim4 lst = lower.strides();
- dim4 ust = upper.strides();
- dim4 ist = in.strides();
-
- for(dim_t ow = 0; ow < idm[3]; ow++) {
- const dim_t lW = ow * lst[3];
- const dim_t uW = ow * ust[3];
- const dim_t iW = ow * ist[3];
-
- for(dim_t oz = 0; oz < idm[2]; oz++) {
- const dim_t lZW = lW + oz * lst[2];
- const dim_t uZW = uW + oz * ust[2];
- const dim_t iZW = iW + oz * ist[2];
-
- for(dim_t oy = 0; oy < idm[1]; oy++) {
- const dim_t lYZW = lZW + oy * lst[1];
- const dim_t uYZW = uZW + oy * ust[1];
- const dim_t iYZW = iZW + oy * ist[1];
-
- for(dim_t ox = 0; ox < idm[0]; ox++) {
- const dim_t lMem = lYZW + ox;
- const dim_t uMem = uYZW + ox;
- const dim_t iMem = iYZW + ox;
- if(ox > oy) {
- if(oy < ldm[1]) l[lMem] = i[iMem];
- if(ox < udm[0]) u[uMem] = scalar<T>(0);
- } else if (oy > ox) {
- if(oy < ldm[1]) l[lMem] = scalar<T>(0);
- if(ox < udm[0]) u[uMem] = i[iMem];
- } else if(ox == oy) {
- if(oy < ldm[1]) l[lMem] = scalar<T>(1.0);
- if(ox < udm[0]) u[uMem] = i[iMem];
- }
- }
- }
- }
- }
-}
-
-void convertPivot(Array<int> p, Array<int> pivot)
-{
- int *d_pi = pivot.get();
- int *d_po = p.get();
- dim_t d0 = pivot.dims()[0];
- for(int j = 0; j < (int)d0; j++) {
- // 1 indexed in pivot
- std::swap(d_po[j], d_po[d_pi[j] - 1]);
- }
-}
-
-template<typename T>
void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
{
in.eval();
@@ -119,7 +59,7 @@ void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
lower = createEmptyArray<T>(ldims);
upper = createEmptyArray<T>(udims);
- getQueue().enqueue(lu_split<T>, lower, upper, in_copy);
+ getQueue().enqueue(kernel::lu_split<T>, lower, upper, in_copy);
}
template<typename T>
@@ -138,7 +78,7 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
if(convert_pivot) {
Array<int> p = range<int>(dim4(iDims[0]), 0);
- getQueue().enqueue(convertPivot, p, pivot);
+ getQueue().enqueue(kernel::convertPivot, p, pivot);
return p;
} else {
return pivot;
diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp
index d4ce95a..e5b030b 100644
--- a/src/backend/cpu/match_template.cpp
+++ b/src/backend/cpu/match_template.cpp
@@ -12,141 +12,24 @@
#include <ArrayInfo.hpp>
#include <Array.hpp>
#include <match_template.hpp>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/match_template.hpp>
using af::dim4;
namespace cpu
{
-template<typename inType, typename outType, af_match_type mType>
-Array<outType> match_template(const Array<inType> &sImg, const Array<inType> &tImg)
+template<typename InT, typename OutT, af_match_type MatchT>
+Array<OutT> match_template(const Array<InT> &sImg, const Array<InT> &tImg)
{
sImg.eval();
tImg.eval();
- Array<outType> out = createEmptyArray<outType>(sImg.dims());
+ Array<OutT> out = createEmptyArray<OutT>(sImg.dims());
- auto func = [=](Array<outType> out, const Array<inType> sImg, const Array<inType> tImg) {
- const dim4 sDims = sImg.dims();
- const dim4 tDims = tImg.dims();
- const dim4 sStrides = sImg.strides();
- const dim4 tStrides = tImg.strides();
-
- const dim_t tDim0 = tDims[0];
- const dim_t tDim1 = tDims[1];
- const dim_t sDim0 = sDims[0];
- const dim_t sDim1 = sDims[1];
-
- const dim4 oStrides = out.strides();
-
- outType tImgMean = outType(0);
- dim_t winNumElements = tImg.elements();
- bool needMean = mType==AF_ZSAD || mType==AF_LSAD ||
- mType==AF_ZSSD || mType==AF_LSSD ||
- mType==AF_ZNCC;
- const inType * tpl = tImg.get();
-
- if (needMean) {
- for(dim_t tj=0; tj<tDim1; tj++) {
- dim_t tjStride = tj*tStrides[1];
-
- for(dim_t ti=0; ti<tDim0; ti++) {
- tImgMean += (outType)tpl[tjStride+ti*tStrides[0]];
- }
- }
- tImgMean /= winNumElements;
- }
-
- outType * dst = out.get();
- const inType * src = sImg.get();
-
- for(dim_t b3=0; b3<sDims[3]; ++b3) {
- for(dim_t b2=0; b2<sDims[2]; ++b2) {
-
- // slide through image window after window
- for(dim_t sj=0; sj<sDim1; sj++) {
-
- dim_t ojStride = sj*oStrides[1];
-
- for(dim_t si=0; si<sDim0; si++) {
- outType disparity = outType(0);
-
- // mean for window
- // this variable will be used based on mType value
- outType wImgMean = outType(0);
- if (needMean) {
- for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
- dim_t jStride = j*sStrides[1];
-
- for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
- inType sVal = ((j<sDim1 && i<sDim0) ?
- src[jStride + i*sStrides[0]] : inType(0));
- wImgMean += (outType)sVal;
- }
- }
- wImgMean /= winNumElements;
- }
-
- // run the window match metric
- for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
- dim_t jStride = j*sStrides[1];
- dim_t tjStride = tj*tStrides[1];
-
- for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
- inType sVal = ((j<sDim1 && i<sDim0) ?
- src[jStride + i*sStrides[0]] : inType(0));
- inType tVal = tpl[tjStride+ti*tStrides[0]];
- outType temp;
- switch(mType) {
- case AF_SAD:
- disparity += fabs((outType)sVal-(outType)tVal);
- break;
- case AF_ZSAD:
- disparity += fabs((outType)sVal - wImgMean -
- (outType)tVal + tImgMean);
- break;
- case AF_LSAD:
- disparity += fabs((outType)sVal-(wImgMean/tImgMean)*tVal);
- break;
- case AF_SSD:
- disparity += ((outType)sVal-(outType)tVal)*((outType)sVal-(outType)tVal);
- break;
- case AF_ZSSD:
- temp = ((outType)sVal - wImgMean - (outType)tVal + tImgMean);
- disparity += temp*temp;
- break;
- case AF_LSSD:
- temp = ((outType)sVal-(wImgMean/tImgMean)*tVal);
- disparity += temp*temp;
- break;
- case AF_NCC:
- //TODO: furture implementation
- break;
- case AF_ZNCC:
- //TODO: furture implementation
- break;
- case AF_SHD:
- //TODO: furture implementation
- break;
- }
- }
- }
- // output is just created, hence not doing the
- // extra multiplication for 0th dim stride
- dst[ojStride + si] = disparity;
- }
- }
- src += sStrides[2];
- dst += oStrides[2];
- }
- src += sStrides[3];
- dst += oStrides[3];
- }
- };
- getQueue().enqueue(func, out, sImg, tImg);
+ getQueue().enqueue(kernel::matchTemplate<OutT, InT, MatchT>, out, sImg, tImg);
return out;
}
diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp
index 62b80e0..6c3417a 100644
--- a/src/backend/cpu/meanshift.cpp
+++ b/src/backend/cpu/meanshift.cpp
@@ -18,6 +18,7 @@
#include <math.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/meanshift.hpp>
using af::dim4;
using std::vector;
@@ -25,11 +26,6 @@ using std::vector;
namespace cpu
{
-inline dim_t clamp(dim_t a, dim_t mn, dim_t mx)
-{
- return (a<mn ? mn : (a>mx ? mx : a));
-}
-
template<typename T, bool is_color>
Array<T> meanshift(const Array<T> &in, const float &s_sigma, const float &c_sigma, const unsigned iter)
{
@@ -37,120 +33,7 @@ Array<T> meanshift(const Array<T> &in, const float &s_sigma, const float &c_sig
Array<T> out = createEmptyArray<T>(in.dims());
- auto func = [=] (Array<T> out, const Array<T> in, const float s_sigma,
- const float c_sigma, const unsigned iter) {
- const dim4 dims = in.dims();
- const dim4 istrides = in.strides();
- const dim4 ostrides = out.strides();
-
- const dim_t bCount = (is_color ? 1 : dims[2]);
- const dim_t channels = (is_color ? dims[2] : 1);
-
- // clamp spatical and chromatic sigma's
- float space_ = std::min(11.5f, s_sigma);
- const dim_t radius = std::max((int)(space_ * 1.5f), 1);
- const float cvar = c_sigma*c_sigma;
-
- vector<float> means;
- vector<float> centers;
- vector<float> tmpclrs;
- means.reserve(channels);
- centers.reserve(channels);
- tmpclrs.reserve(channels);
-
- T *outData = out.get();
- const T * inData = in.get();
-
- for(dim_t b3=0; b3<dims[3]; ++b3) {
- for(dim_t b2=0; b2<bCount; ++b2) {
-
- for(dim_t j=0; j<dims[1]; ++j) {
-
- dim_t j_in_off = j*istrides[1];
- dim_t j_out_off = j*ostrides[1];
-
- for(dim_t i=0; i<dims[0]; ++i) {
-
- dim_t i_in_off = i*istrides[0];
- dim_t i_out_off = i*ostrides[0];
-
- // clear means and centers for this pixel
- for(dim_t ch=0; ch<channels; ++ch) {
- means[ch] = 0.0f;
- // the expression ch*istrides[2] will only effect when ch>1
- // i.e for color images where batch is along fourth dimension
- centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]];
- }
-
- // scope of meanshift iterationd begin
- for(unsigned it=0; it<iter; ++it) {
-
- int count = 0;
- int shift_x = 0;
- int shift_y = 0;
-
- for(dim_t wj=-radius; wj<=radius; ++wj) {
-
- int hit_count = 0;
-
- for(dim_t wi=-radius; wi<=radius; ++wi) {
-
- dim_t tj = j + wj;
- dim_t ti = i + wi;
-
- // clamps offsets
- tj = clamp(tj, 0ll, dims[1]-1);
- ti = clamp(ti, 0ll, dims[0]-1);
-
- // proceed
- float norm = 0.0f;
- for(dim_t ch=0; ch<channels; ++ch) {
- tmpclrs[ch] = inData[ tj*istrides[1] + ti*istrides[0] + ch*istrides[2]];
- norm += (centers[ch]-tmpclrs[ch]) * (centers[ch]-tmpclrs[ch]);
- }
-
- if (norm<= cvar) {
- for(dim_t ch=0; ch<channels; ++ch)
- means[ch] += tmpclrs[ch];
- shift_x += wi;
- ++hit_count;
- }
-
- }
- count+= hit_count;
- shift_y += wj*hit_count;
- }
-
- if (count==0) { break; }
-
- const float fcount = 1.f/count;
- const int mean_x = (int)(shift_x*fcount+0.5f);
- const int mean_y = (int)(shift_y*fcount+0.5f);
- for(dim_t ch=0; ch<channels; ++ch)
- means[ch] *= fcount;
-
- float norm = 0.f;
- for(dim_t ch=0; ch<channels; ++ch)
- norm += ((means[ch]-centers[ch])*(means[ch]-centers[ch]));
- bool stop = ((abs(shift_y-mean_y)+abs(shift_x-mean_x)) + norm) <= 1;
- shift_x = mean_x;
- shift_y = mean_y;
- for(dim_t ch=0; ch<channels; ++ch)
- centers[ch] = means[ch];
- if (stop) { break; }
- } // scope of meanshift iterations end
-
- for(dim_t ch=0; ch<channels; ++ch)
- outData[j_out_off + i_out_off + ch*ostrides[2]] = centers[ch];
-
- }
- }
- outData += ostrides[2];
- inData += istrides[2];
- }
- }
- };
- getQueue().enqueue(func, out, in, s_sigma, c_sigma, iter);
+ getQueue().enqueue(kernel::meanShift<T, is_color>, out, in, s_sigma, c_sigma, iter);
return out;
}
diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp
index 4e74a55..06cc0df 100644
--- a/src/backend/cpu/medfilt.cpp
+++ b/src/backend/cpu/medfilt.cpp
@@ -12,10 +12,9 @@
#include <ArrayInfo.hpp>
#include <Array.hpp>
#include <medfilt.hpp>
-#include <err_cpu.hpp>
-#include <algorithm>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/medfilt.hpp>
using af::dim4;
@@ -27,119 +26,9 @@ Array<T> medfilt(const Array<T> &in, dim_t w_len, dim_t w_wid)
{
in.eval();
- Array<T> out = createEmptyArray<T>(in.dims());
+ Array<T> out = createEmptyArray<T>(in.dims());
- auto func = [=] (Array<T> out, const Array<T> in,
- dim_t w_len, dim_t w_wid) {
- const dim4 dims = in.dims();
- const dim4 istrides = in.strides();
- const dim4 ostrides = out.strides();
-
- std::vector<T> wind_vals;
- wind_vals.reserve(w_len*w_wid);
-
- T const * in_ptr = in.get();
- T * out_ptr = out.get();
-
- for(int b3=0; b3<(int)dims[3]; b3++) {
-
- for(int b2=0; b2<(int)dims[2]; b2++) {
-
- for(int col=0; col<(int)dims[1]; col++) {
-
- int ocol_off = col*ostrides[1];
-
- for(int row=0; row<(int)dims[0]; row++) {
-
- wind_vals.clear();
-
- for(int wj=0; wj<(int)w_wid; ++wj) {
-
- bool isColOff = false;
-
- int im_col = col + wj-w_wid/2;
- int im_coff;
- switch(pad) {
- case AF_PAD_ZERO:
- im_coff = im_col * istrides[1];
- if (im_col < 0 || im_col>=(int)dims[1])
- isColOff = true;
- break;
- case AF_PAD_SYM:
- {
- if (im_col < 0) {
- im_col *= -1;
- isColOff = true;
- }
-
- if (im_col>=(int)dims[1]) {
- im_col = 2*((int)dims[1]-1) - im_col;
- isColOff = true;
- }
-
- im_coff = im_col * istrides[1];
- }
- break;
- }
-
- for(int wi=0; wi<(int)w_len; ++wi) {
-
- bool isRowOff = false;
-
- int im_row = row + wi-w_len/2;
- int im_roff;
- switch(pad) {
- case AF_PAD_ZERO:
- im_roff = im_row * istrides[0];
- if (im_row < 0 || im_row>=(int)dims[0])
- isRowOff = true;
- break;
- case AF_PAD_SYM:
- {
- if (im_row < 0) {
- im_row *= -1;
- isRowOff = true;
- }
-
- if (im_row>=(int)dims[0]) {
- im_row = 2*((int)dims[0]-1) - im_row;
- isRowOff = true;
- }
-
- im_roff = im_row * istrides[0];
- }
- break;
- }
-
- if(isRowOff || isColOff) {
- switch(pad) {
- case AF_PAD_ZERO:
- wind_vals.push_back(0);
- break;
- case AF_PAD_SYM:
- wind_vals.push_back(in_ptr[im_coff+im_roff]);
- break;
- }
- } else
- wind_vals.push_back(in_ptr[im_coff+im_roff]);
- }
- }
-
- std::stable_sort(wind_vals.begin(),wind_vals.end());
- int off = wind_vals.size()/2;
- if (wind_vals.size()%2==0)
- out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2;
- else {
- out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off];
- }
- }
- }
- in_ptr += istrides[2];
- out_ptr += ostrides[2];
- }
- }
- };
- getQueue().enqueue(func, out, in, w_len, w_wid);
+ getQueue().enqueue(kernel::medfilt<T, pad>, out, in, w_len, w_wid);
return out;
}
diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp
index 945c32b..462319d 100644
--- a/src/backend/cpu/morph.cpp
+++ b/src/backend/cpu/morph.cpp
@@ -15,21 +15,13 @@
#include <algorithm>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/morph.hpp>
using af::dim4;
namespace cpu
{
-static inline unsigned getIdx(const dim4 &strides,
- int i, int j = 0, int k = 0, int l = 0)
-{
- return (l * strides[3] +
- k * strides[2] +
- j * strides[1] +
- i * strides[0]);
-}
-
template<typename T, bool isDilation>
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
@@ -38,60 +30,7 @@ Array<T> morph(const Array<T> &in, const Array<T> &mask)
Array<T> out = createEmptyArray<T>(in.dims());
- auto func = [=] (Array<T> out, const Array<T> in, const Array<T> mask) {
- const dim4 ostrides = out.strides();
- const dim4 istrides = in.strides();
- const dim4 fstrides = mask.strides();
- const dim4 dims = in.dims();
- const dim4 window = mask.dims();
- T* outData = out.get();
- const T* inData = in.get();
- const T* filter = mask.get();
- const dim_t R0 = window[0]/2;
- const dim_t R1 = window[1]/2;
-
- for(dim_t b3=0; b3<dims[3]; ++b3) {
- for(dim_t b2=0; b2<dims[2]; ++b2) {
- // either channels or batch is handled by outer most loop
- for(dim_t j=0; j<dims[1]; ++j) {
- // j steps along 2nd dimension
- for(dim_t i=0; i<dims[0]; ++i) {
- // i steps along 1st dimension
- T filterResult = inData[ getIdx(istrides, i, j) ];
-
- // wj,wi steps along 2nd & 1st dimensions of filter window respectively
- for(dim_t wj=0; wj<window[1]; wj++) {
- for(dim_t wi=0; wi<window[0]; wi++) {
-
- dim_t offj = j+wj-R1;
- dim_t offi = i+wi-R0;
-
- T maskValue = filter[ getIdx(fstrides, wi, wj) ];
-
- if ((maskValue > (T)0) && offi>=0 && offj>=0 && offi<dims[0] && offj<dims[1]) {
-
- T inValue = inData[ getIdx(istrides, offi, offj) ];
-
- if (isDilation)
- filterResult = std::max(filterResult, inValue);
- else
- filterResult = std::min(filterResult, inValue);
- }
-
- } // window 1st dimension loop ends here
- } // filter window loop ends here
-
- outData[ getIdx(ostrides, i, j) ] = filterResult;
- } //1st dimension loop ends here
- } // 2nd dimension loop ends here
-
- // next iteration will be next batch if any
- outData += ostrides[2];
- inData += istrides[2];
- }
- }
- };
- getQueue().enqueue(func, out, in, mask);
+ getQueue().enqueue(kernel::morph<T, isDilation>, out, in, mask);
return out;
}
@@ -104,66 +43,7 @@ Array<T> morph3d(const Array<T> &in, const Array<T> &mask)
Array<T> out = createEmptyArray<T>(in.dims());
- auto func = [=] (Array<T> out, const Array<T> in, const Array<T> mask) {
- const dim4 dims = in.dims();
- const dim4 window = mask.dims();
- const dim_t R0 = window[0]/2;
- const dim_t R1 = window[1]/2;
- const dim_t R2 = window[2]/2;
- const dim4 istrides = in.strides();
- const dim4 fstrides = mask.strides();
- const dim_t bCount = dims[3];
- const dim4 ostrides = out.strides();
- T* outData = out.get();
- const T* inData = in.get();
- const T* filter = mask.get();
-
- for(dim_t batchId=0; batchId<bCount; ++batchId) {
- // either channels or batch is handled by outer most loop
- for(dim_t k=0; k<dims[2]; ++k) {
- // k steps along 3rd dimension
- for(dim_t j=0; j<dims[1]; ++j) {
- // j steps along 2nd dimension
- for(dim_t i=0; i<dims[0]; ++i) {
- // i steps along 1st dimension
- T filterResult = inData[ getIdx(istrides, i, j, k) ];
-
- // wk, wj,wi steps along 2nd & 1st dimensions of filter window respectively
- for(dim_t wk=0; wk<window[2]; wk++) {
- for(dim_t wj=0; wj<window[1]; wj++) {
- for(dim_t wi=0; wi<window[0]; wi++) {
-
- dim_t offk = k+wk-R2;
- dim_t offj = j+wj-R1;
- dim_t offi = i+wi-R0;
-
- T maskValue = filter[ getIdx(fstrides, wi, wj, wk) ];
-
- if ((maskValue > (T)0) && offi>=0 && offj>=0 && offk>=0 &&
- offi<dims[0] && offj<dims[1] && offk<dims[2]) {
-
- T inValue = inData[ getIdx(istrides, offi, offj, offk) ];
-
- if (isDilation)
- filterResult = std::max(filterResult, inValue);
- else
- filterResult = std::min(filterResult, inValue);
- }
-
- } // window 1st dimension loop ends here
- } // window 1st dimension loop ends here
- }// filter window loop ends here
-
- outData[ getIdx(ostrides, i, j, k) ] = filterResult;
- } //1st dimension loop ends here
- } // 2nd dimension loop ends here
- } // 3rd dimension loop ends here
- // next iteration will be next batch if any
- outData += ostrides[3];
- inData += istrides[3];
- }
- };
- getQueue().enqueue(func, out, in, mask);
+ getQueue().enqueue(kernel::morph3d<T, isDilation>, out, in, mask);
return out;
}
diff --git a/src/backend/cpu/utility.hpp b/src/backend/cpu/utility.hpp
index ed8bbd7..68cef5a 100644
--- a/src/backend/cpu/utility.hpp
+++ b/src/backend/cpu/utility.hpp
@@ -31,9 +31,9 @@ dim_t trimIndex(int const & idx, dim_t const & len)
}
static inline
-dim_t clamp(int a, dim_t mn, dim_t mx)
+dim_t clamp(dim_t a, dim_t mn, dim_t mx)
{
- return (a < (int)mn ? mn : (a > (int)mx ? mx : a));
+ return (a<mn ? mn : (a>mx ? mx : a));
}
static inline
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list