[arrayfire] 81/284: Added missing eval for input Array's in cpu backend fns
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Feb 7 18:59:21 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.
commit 3ba9633e8d1dba645ba29695ef99e11a616ef2f2
Author: pradeep <pradeep at arrayfire.com>
Date: Thu Dec 17 14:22:46 2015 -0500
Added missing eval for input Array's in cpu backend fns
This change has some style fixes related to cpu namespace
---
src/backend/cpu/Array.cpp | 402 ++++----
src/backend/cpu/approx.cpp | 584 +++++------
src/backend/cpu/bilateral.cpp | 1 +
src/backend/cpu/cholesky.cpp | 4 +
src/backend/cpu/copy.cpp | 278 ++---
src/backend/cpu/diagonal.cpp | 125 +--
src/backend/cpu/diff.cpp | 205 ++--
src/backend/cpu/exampleFunction.cpp | 7 +
src/backend/cpu/fft.cpp | 6 +
src/backend/cpu/gradient.cpp | 1 +
src/backend/cpu/identity.cpp | 2 +
src/backend/cpu/iota.cpp | 2 +
src/backend/cpu/ireduce.cpp | 282 ++---
src/backend/cpu/lu.cpp | 6 +
src/backend/cpu/match_template.cpp | 3 +
src/backend/cpu/math.cpp | 72 +-
src/backend/cpu/meanshift.cpp | 2 +
src/backend/cpu/medfilt.cpp | 2 +
src/backend/cpu/memory.cpp | 333 +++---
src/backend/cpu/nearest_neighbour.cpp | 2 -
src/backend/cpu/qr.cpp | 6 +
src/backend/cpu/range.cpp | 2 +
src/backend/cpu/reorder.cpp | 98 +-
src/backend/cpu/resize.cpp | 2 +
src/backend/cpu/rotate.cpp | 178 ++--
src/backend/cpu/set.cpp | 178 ++--
src/backend/cpu/shift.cpp | 6 +-
src/backend/cpu/sift_nonfree.hpp | 1811 +++++++++++++++++----------------
src/backend/cpu/sobel.cpp | 1 +
src/backend/cpu/solve.cpp | 11 +-
src/backend/cpu/sort.cpp | 96 +-
src/backend/cpu/sort_index.cpp | 140 +--
src/backend/cpu/susan.cpp | 2 +
src/backend/cpu/svd.cpp | 147 +--
src/backend/cpu/transform.cpp | 4 +-
src/backend/cpu/transpose.cpp | 1 -
src/backend/cpu/triangle.cpp | 25 +-
src/backend/cpu/unwrap.cpp | 3 +-
src/backend/cpu/where.cpp | 81 +-
39 files changed, 2606 insertions(+), 2505 deletions(-)
diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp
index 456f4c8..9c15bc4 100644
--- a/src/backend/cpu/Array.cpp
+++ b/src/backend/cpu/Array.cpp
@@ -21,250 +21,251 @@
namespace cpu
{
- const int MAX_TNJ_LEN = 20;
- using TNJ::BufferNode;
- using TNJ::Node;
- using TNJ::Node_ptr;
-
- using af::dim4;
-
- template<typename T>
- Array<T>::Array(dim4 dims):
- info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
- data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
- node(), offset(0), ready(true), owner(true)
- { }
-
- template<typename T>
- Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device):
- info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
- data((is_device & !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
- node(), offset(0), ready(true), owner(true)
- {
- static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
- static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
- if (!is_device || copy_device) {
- std::copy(in_data, in_data + dims.elements(), data.get());
- }
- }
- template<typename T>
- Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) :
- info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
- data(), data_dims(dims),
- node(n), offset(0), ready(false), owner(true)
- {
+const int MAX_TNJ_LEN = 20;
+using TNJ::BufferNode;
+using TNJ::Node;
+using TNJ::Node_ptr;
+
+using af::dim4;
+
+template<typename T>
+Array<T>::Array(dim4 dims):
+ info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+ data(memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
+ node(), offset(0), ready(true), owner(true)
+{ }
+
+template<typename T>
+Array<T>::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device):
+ info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+ data((is_device & !copy_device) ? (T*)in_data : memAlloc<T>(dims.elements()), memFree<T>), data_dims(dims),
+ node(), offset(0), ready(true), owner(true)
+{
+ static_assert(std::is_standard_layout<Array<T>>::value, "Array<T> must be a standard layout type");
+ static_assert(offsetof(Array<T>, info) == 0, "Array<T>::info must be the first member variable of Array<T>");
+ if (!is_device || copy_device) {
+ std::copy(in_data, in_data + dims.elements(), data.get());
}
+}
- template<typename T>
- Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
- info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
- data(parent.getData()), data_dims(parent.getDataDims()),
- node(),
- offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
- ready(true), owner(false)
- { }
+template<typename T>
+Array<T>::Array(af::dim4 dims, TNJ::Node_ptr n) :
+ info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
+ data(), data_dims(dims),
+ node(n), offset(0), ready(false), owner(true)
+{
+}
+template<typename T>
+Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
+ info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
+ data(parent.getData()), data_dims(parent.getDataDims()),
+ node(),
+ offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
+ ready(true), owner(false)
+{ }
- template<typename T>
- void Array<T>::eval()
- {
- if (isReady()) return;
- if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);
- this->setId(getActiveDeviceId());
+template<typename T>
+void Array<T>::eval()
+{
+ if (isReady()) return;
+ if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);
- data = std::shared_ptr<T>(memAlloc<T>(elements()), memFree<T>);
+ this->setId(getActiveDeviceId());
- auto func = [] (Array<T> in) {
- in.setId(getActiveDeviceId());
- T *ptr = in.data.get();
+ data = std::shared_ptr<T>(memAlloc<T>(elements()), memFree<T>);
- dim4 odims = in.dims();
- dim4 ostrs = in.strides();
+ auto func = [] (Array<T> in) {
+ in.setId(getActiveDeviceId());
+ T *ptr = in.data.get();
- bool is_linear = in.node->isLinear(odims.get());
+ dim4 odims = in.dims();
+ dim4 ostrs = in.strides();
- if (is_linear) {
- int num = in.elements();
- for (int i = 0; i < num; i++) {
- ptr[i] = *(T *)in.node->calc(i);
- }
- } else {
- for (int w = 0; w < (int)odims[3]; w++) {
- dim_t offw = w * ostrs[3];
+ bool is_linear = in.node->isLinear(odims.get());
+
+ if (is_linear) {
+ int num = in.elements();
+ for (int i = 0; i < num; i++) {
+ ptr[i] = *(T *)in.node->calc(i);
+ }
+ } else {
+ for (int w = 0; w < (int)odims[3]; w++) {
+ dim_t offw = w * ostrs[3];
- for (int z = 0; z < (int)odims[2]; z++) {
- dim_t offz = z * ostrs[2] + offw;
+ for (int z = 0; z < (int)odims[2]; z++) {
+ dim_t offz = z * ostrs[2] + offw;
- for (int y = 0; y < (int)odims[1]; y++) {
- dim_t offy = y * ostrs[1] + offz;
+ for (int y = 0; y < (int)odims[1]; y++) {
+ dim_t offy = y * ostrs[1] + offz;
- for (int x = 0; x < (int)odims[0]; x++) {
- dim_t id = x + offy;
+ for (int x = 0; x < (int)odims[0]; x++) {
+ dim_t id = x + offy;
- ptr[id] = *(T *)in.node->calc(x, y, z, w);
- }
+ ptr[id] = *(T *)in.node->calc(x, y, z, w);
}
}
}
}
- };
-
- getQueue().enqueue(func, *this);
+ }
+ };
- ready = true;
- Node_ptr prev = node;
- prev->reset();
- // FIXME: Replace the current node in any JIT possible trees with the new BufferNode
- node.reset();
- }
+ getQueue().enqueue(func, *this);
- template<typename T>
- void Array<T>::eval() const
- {
- if (isReady()) return;
- const_cast<Array<T> *>(this)->eval();
- }
+ ready = true;
+ Node_ptr prev = node;
+ prev->reset();
+ // FIXME: Replace the current node in any JIT possible trees with the new BufferNode
+ node.reset();
+}
- template<typename T>
- Node_ptr Array<T>::getNode() const
- {
- if (!node) {
+template<typename T>
+void Array<T>::eval() const
+{
+ if (isReady()) return;
+ const_cast<Array<T> *>(this)->eval();
+}
- unsigned bytes = this->getDataDims().elements() * sizeof(T);
+template<typename T>
+Node_ptr Array<T>::getNode() const
+{
+ if (!node) {
- BufferNode<T> *buf_node = new BufferNode<T>(data,
- bytes,
- offset,
- dims().get(),
- strides().get(),
- isLinear());
+ unsigned bytes = this->getDataDims().elements() * sizeof(T);
- const_cast<Array<T> *>(this)->node = Node_ptr(reinterpret_cast<Node *>(buf_node));
- }
+ BufferNode<T> *buf_node = new BufferNode<T>(data,
+ bytes,
+ offset,
+ dims().get(),
+ strides().get(),
+ isLinear());
- return node;
+ const_cast<Array<T> *>(this)->node = Node_ptr(reinterpret_cast<Node *>(buf_node));
}
- template<typename T>
- Array<T>
- createHostDataArray(const dim4 &size, const T * const data)
- {
- return Array<T>(size, data, false);
- }
+ return node;
+}
- template<typename T>
- Array<T>
- createDeviceDataArray(const dim4 &size, const void *data)
- {
- return Array<T>(size, (const T * const) data, true);
- }
+template<typename T>
+Array<T>
+createHostDataArray(const dim4 &size, const T * const data)
+{
+ return Array<T>(size, data, false);
+}
- template<typename T>
- Array<T>
- createValueArray(const dim4 &size, const T& value)
- {
- TNJ::ScalarNode<T> *node = new TNJ::ScalarNode<T>(value);
- return createNodeArray<T>(size, TNJ::Node_ptr(
- reinterpret_cast<TNJ::Node *>(node)));
- }
+template<typename T>
+Array<T>
+createDeviceDataArray(const dim4 &size, const void *data)
+{
+ return Array<T>(size, (const T * const) data, true);
+}
- template<typename T>
- Array<T>
- createEmptyArray(const dim4 &size)
- {
- return Array<T>(size);
- }
+template<typename T>
+Array<T>
+createValueArray(const dim4 &size, const T& value)
+{
+ TNJ::ScalarNode<T> *node = new TNJ::ScalarNode<T>(value);
+ return createNodeArray<T>(size, TNJ::Node_ptr(
+ reinterpret_cast<TNJ::Node *>(node)));
+}
- template<typename T>
- Array<T> *initArray() { return new Array<T>(dim4(0, 0, 0, 0)); }
+template<typename T>
+Array<T>
+createEmptyArray(const dim4 &size)
+{
+ return Array<T>(size);
+}
+template<typename T>
+Array<T> *initArray() { return new Array<T>(dim4(0, 0, 0, 0)); }
- template<typename T>
- Array<T>
- createNodeArray(const dim4 &dims, Node_ptr node)
- {
- Array<T> out = Array<T>(dims, node);
- unsigned length =0, buf_count = 0, bytes = 0;
+template<typename T>
+Array<T>
+createNodeArray(const dim4 &dims, Node_ptr node)
+{
+ Array<T> out = Array<T>(dims, node);
- Node *n = node.get();
- n->getInfo(length, buf_count, bytes);
- n->reset();
+ unsigned length =0, buf_count = 0, bytes = 0;
- if (length > MAX_TNJ_LEN ||
- buf_count >= MAX_BUFFERS ||
- bytes >= MAX_BYTES) {
- out.eval();
- }
+ Node *n = node.get();
+ n->getInfo(length, buf_count, bytes);
+ n->reset();
- return out;
+ if (length > MAX_TNJ_LEN ||
+ buf_count >= MAX_BUFFERS ||
+ bytes >= MAX_BYTES) {
+ out.eval();
}
+ return out;
+}
- template<typename T>
- Array<T> createSubArray(const Array<T>& parent,
- const std::vector<af_seq> &index,
- bool copy)
- {
- parent.eval();
- dim4 dDims = parent.getDataDims();
- dim4 pDims = parent.dims();
+template<typename T>
+Array<T> createSubArray(const Array<T>& parent,
+ const std::vector<af_seq> &index,
+ bool copy)
+{
+ parent.eval();
- dim4 dims = toDims (index, pDims);
- dim4 offset = toOffset(index, dDims);
- dim4 stride = toStride (index, dDims);
+ dim4 dDims = parent.getDataDims();
+ dim4 pDims = parent.dims();
- Array<T> out = Array<T>(parent, dims, offset, stride);
+ dim4 dims = toDims (index, pDims);
+ dim4 offset = toOffset(index, dDims);
+ dim4 stride = toStride (index, dDims);
- if (!copy) return out;
+ Array<T> out = Array<T>(parent, dims, offset, stride);
- if (stride[0] != 1 ||
- stride[1] < 0 ||
- stride[2] < 0 ||
- stride[3] < 0) {
+ if (!copy) return out;
- out = copyArray(out);
- }
+ if (stride[0] != 1 ||
+ stride[1] < 0 ||
+ stride[2] < 0 ||
+ stride[3] < 0) {
- return out;
+ out = copyArray(out);
}
- template<typename T>
- void
- destroyArray(Array<T> *A)
- {
- delete A;
- }
+ return out;
+}
+template<typename T>
+void
+destroyArray(Array<T> *A)
+{
+ delete A;
+}
- template<typename T>
- void evalArray(const Array<T> &A)
- {
- A.eval();
- }
- template<typename T>
- void
- writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
- {
- if(!arr.isOwner()) {
- arr = createEmptyArray<T>(arr.dims());
- }
- memcpy(arr.get() + arr.getOffset(), data, bytes);
+template<typename T>
+void evalArray(const Array<T> &A)
+{
+ A.eval();
+}
+
+template<typename T>
+void
+writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
+{
+ if(!arr.isOwner()) {
+ arr = createEmptyArray<T>(arr.dims());
}
+ memcpy(arr.get() + arr.getOffset(), data, bytes);
+}
- template<typename T>
- void
- writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
- {
- if(!arr.isOwner()) {
- arr = createEmptyArray<T>(arr.dims());
- }
- memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes);
+template<typename T>
+void
+writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
+{
+ if(!arr.isOwner()) {
+ arr = createEmptyArray<T>(arr.dims());
}
+ memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes);
+}
#define INSTANTIATE(T) \
template Array<T> createHostDataArray<T> (const dim4 &size, const T * const data); \
@@ -286,16 +287,17 @@ namespace cpu
template void writeHostDataArray<T> (Array<T> &arr, const T * const data, const size_t bytes); \
template void writeDeviceDataArray<T> (Array<T> &arr, const void * const data, const size_t bytes); \
- INSTANTIATE(float)
- INSTANTIATE(double)
- INSTANTIATE(cfloat)
- INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(uchar)
- INSTANTIATE(char)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
}
diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp
index 4d3c880..7988863 100644
--- a/src/backend/cpu/approx.cpp
+++ b/src/backend/cpu/approx.cpp
@@ -17,330 +17,339 @@
namespace cpu
{
- ///////////////////////////////////////////////////////////////////////////
- // Approx1
- ///////////////////////////////////////////////////////////////////////////
- template<typename Ty, typename Tp, af_interp_type method>
- struct approx1_op
+
+///////////////////////////////////////////////////////////////////////////
+// Approx1
+///////////////////////////////////////////////////////////////////////////
+template<typename Ty, typename Tp, af_interp_type method>
+struct approx1_op
+{
+ void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+ const Ty *in, const af::dim4 &idims, const dim_t iElems,
+ const Tp *pos, const af::dim4 &pdims,
+ const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
+ const float offGrid, const bool pBatch,
+ const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
{
- void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims,
- const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
- const float offGrid, const bool pBatch,
- const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
- {
- return;
- }
- };
+ return;
+ }
+};
- template<typename Ty, typename Tp>
- struct approx1_op<Ty, Tp, AF_INTERP_NEAREST>
+template<typename Ty, typename Tp>
+struct approx1_op<Ty, Tp, AF_INTERP_NEAREST>
+{
+ void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+ const Ty *in, const af::dim4 &idims, const dim_t iElems,
+ const Tp *pos, const af::dim4 &pdims,
+ const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
+ const float offGrid, const bool pBatch,
+ const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
{
- void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims,
- const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
- const float offGrid, const bool pBatch,
- const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
- {
- dim_t pmId = idx;
- if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
-
- const Tp x = pos[pmId];
- bool gFlag = false;
- if (x < 0 || idims[0] < x+1) { // No need to check y
- gFlag = true;
- }
+ dim_t pmId = idx;
+ if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
- const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
- + idy * ostrides[1] + idx;
- if(gFlag) {
- out[omId] = scalar<Ty>(offGrid);
- } else {
- dim_t ioff = idw * istrides[3] + idz * istrides[2]
- + idy * istrides[1];
- const dim_t iMem = round(x) + ioff;
+ const Tp x = pos[pmId];
+ bool gFlag = false;
+ if (x < 0 || idims[0] < x+1) { // No need to check y
+ gFlag = true;
+ }
- out[omId] = in[iMem];
- }
+ const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+ + idy * ostrides[1] + idx;
+ if(gFlag) {
+ out[omId] = scalar<Ty>(offGrid);
+ } else {
+ dim_t ioff = idw * istrides[3] + idz * istrides[2]
+ + idy * istrides[1];
+ const dim_t iMem = round(x) + ioff;
+
+ out[omId] = in[iMem];
}
- };
+ }
+};
- template<typename Ty, typename Tp>
- struct approx1_op<Ty, Tp, AF_INTERP_LINEAR>
+template<typename Ty, typename Tp>
+struct approx1_op<Ty, Tp, AF_INTERP_LINEAR>
+{
+ void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+ const Ty *in, const af::dim4 &idims, const dim_t iElems,
+ const Tp *pos, const af::dim4 &pdims,
+ const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
+ const float offGrid, const bool pBatch,
+ const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
{
- void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims,
- const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
- const float offGrid, const bool pBatch,
- const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
- {
- dim_t pmId = idx;
- if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
-
- const Tp x = pos[pmId];
- bool gFlag = false;
- if (x < 0 || idims[0] < x+1) {
- gFlag = true;
- }
+ dim_t pmId = idx;
+ if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1];
- const dim_t grid_x = floor(x); // nearest grid
- const Tp off_x = x - grid_x; // fractional offset
-
- const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
- + idy * ostrides[1] + idx;
- if(gFlag) {
- out[omId] = scalar<Ty>(offGrid);
- } else {
- dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
-
- // Check if x and x + 1 are both valid indices
- bool cond = (x < idims[0] - 1);
- // Compute Left and Right Weighted Values
- Ty yl = ((Tp)1.0 - off_x) * in[ioff];
- Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
- Ty yo = yl + yr;
- // Compute Weight used
- Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
- // Write final value
- out[omId] = (yo / wt);
- }
+ const Tp x = pos[pmId];
+ bool gFlag = false;
+ if (x < 0 || idims[0] < x+1) {
+ gFlag = true;
}
- };
-
- template<typename Ty, typename Tp, af_interp_type method>
- void approx1_(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims,
- const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides,
- const float offGrid)
- {
- approx1_op<Ty, Tp, method> op;
- bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1);
-
- for(dim_t w = 0; w < odims[3]; w++) {
- for(dim_t z = 0; z < odims[2]; z++) {
- for(dim_t y = 0; y < odims[1]; y++) {
- for(dim_t x = 0; x < odims[0]; x++) {
- op(out, odims, oElems, in, idims, iElems, pos, pdims,
- ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
- }
+
+ const dim_t grid_x = floor(x); // nearest grid
+ const Tp off_x = x - grid_x; // fractional offset
+
+ const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+ + idy * ostrides[1] + idx;
+ if(gFlag) {
+ out[omId] = scalar<Ty>(offGrid);
+ } else {
+ dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x;
+
+ // Check if x and x + 1 are both valid indices
+ bool cond = (x < idims[0] - 1);
+ // Compute Left and Right Weighted Values
+ Ty yl = ((Tp)1.0 - off_x) * in[ioff];
+ Ty yr = cond ? (off_x) * in[ioff + 1] : scalar<Ty>(0);
+ Ty yo = yl + yr;
+ // Compute Weight used
+ Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x);
+ // Write final value
+ out[omId] = (yo / wt);
+ }
+ }
+};
+
+template<typename Ty, typename Tp, af_interp_type method>
+void approx1_(Array<Ty> output, Array<Ty> const input,
+ Array<Tp> const position, float const offGrid)
+{
+ Ty * out = output.get();
+ Ty const * const in = input.get();
+ Tp const * const pos = position.get();
+ dim4 const odims = output.dims();
+ dim4 const idims = input.dims();
+ dim4 const pdims = position.dims();
+ dim4 const ostrides = output.strides();
+ dim4 const istrides = input.strides();
+ dim4 const pstrides = position.strides();
+ dim_t const oElems = output.elements();
+ dim_t const iElems = input.elements();
+
+ approx1_op<Ty, Tp, method> op;
+ bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1);
+
+ for(dim_t w = 0; w < odims[3]; w++) {
+ for(dim_t z = 0; z < odims[2]; z++) {
+ for(dim_t y = 0; y < odims[1]; y++) {
+ for(dim_t x = 0; x < odims[0]; x++) {
+ op(out, odims, oElems, in, idims, iElems, pos, pdims,
+ ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w);
}
}
}
}
+}
- template<typename Ty, typename Tp>
- Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
- const af_interp_type method, const float offGrid)
- {
- in.eval();
- pos.eval();
-
- af::dim4 odims = in.dims();
- odims[0] = pos.dims()[0];
-
- // Create output placeholder
- Array<Ty> out = createEmptyArray<Ty>(odims);
-
- switch(method) {
- case AF_INTERP_NEAREST:
- getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_NEAREST>,
- out.get(), out.dims(), out.elements(),
- in.get(), in.dims(), in.elements(), pos.get(), pos.dims(),
- out.strides(), in.strides(), pos.strides(), offGrid);
- break;
- case AF_INTERP_LINEAR:
- getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_LINEAR>,
- out.get(), out.dims(), out.elements(),
- in.get(), in.dims(), in.elements(), pos.get(), pos.dims(),
- out.strides(), in.strides(), pos.strides(), offGrid);
- break;
- default:
- break;
- }
- return out;
+template<typename Ty, typename Tp>
+Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
+ const af_interp_type method, const float offGrid)
+{
+ in.eval();
+ pos.eval();
+
+ af::dim4 odims = in.dims();
+ odims[0] = pos.dims()[0];
+
+ // Create output placeholder
+ Array<Ty> out = createEmptyArray<Ty>(odims);
+
+ switch(method) {
+ case AF_INTERP_NEAREST:
+ getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_NEAREST>,
+ out, in, pos, offGrid);
+ break;
+ case AF_INTERP_LINEAR:
+ getQueue().enqueue(approx1_<Ty, Tp, AF_INTERP_LINEAR>,
+ out, in, pos, offGrid);
+ break;
+ default:
+ break;
}
+ return out;
+}
- ///////////////////////////////////////////////////////////////////////////
- // Approx2
- ///////////////////////////////////////////////////////////////////////////
- template<typename Ty, typename Tp, af_interp_type method>
- struct approx2_op
+///////////////////////////////////////////////////////////////////////////
+// Approx2
+///////////////////////////////////////////////////////////////////////////
+template<typename Ty, typename Tp, af_interp_type method>
+struct approx2_op
+{
+ void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+ const Ty *in, const af::dim4 &idims, const dim_t iElems,
+ const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
+ const af::dim4 &ostrides, const af::dim4 &istrides,
+ const af::dim4 &pstrides, const af::dim4 &qstrides,
+ const float offGrid, const bool pBatch,
+ const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
{
- void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const af::dim4 &pstrides, const af::dim4 &qstrides,
- const float offGrid, const bool pBatch,
- const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
- {
- return;
- }
- };
+ return;
+ }
+};
- template<typename Ty, typename Tp>
- struct approx2_op<Ty, Tp, AF_INTERP_NEAREST>
+template<typename Ty, typename Tp>
+struct approx2_op<Ty, Tp, AF_INTERP_NEAREST>
+{
+ void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+ const Ty *in, const af::dim4 &idims, const dim_t iElems,
+ const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
+ const af::dim4 &ostrides, const af::dim4 &istrides,
+ const af::dim4 &pstrides, const af::dim4 &qstrides,
+ const float offGrid, const bool pBatch,
+ const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
{
- void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const af::dim4 &pstrides, const af::dim4 &qstrides,
- const float offGrid, const bool pBatch,
- const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
- {
- dim_t pmId = idy * pstrides[1] + idx;
- dim_t qmId = idy * qstrides[1] + idx;
- if(pBatch) {
- pmId += idw * pstrides[3] + idz * pstrides[2];
- qmId += idw * qstrides[3] + idz * qstrides[2];
- }
+ dim_t pmId = idy * pstrides[1] + idx;
+ dim_t qmId = idy * qstrides[1] + idx;
+ if(pBatch) {
+ pmId += idw * pstrides[3] + idz * pstrides[2];
+ qmId += idw * qstrides[3] + idz * qstrides[2];
+ }
- bool gFlag = false;
- const Tp x = pos[pmId], y = qos[qmId];
- if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
- gFlag = true;
- }
+ bool gFlag = false;
+ const Tp x = pos[pmId], y = qos[qmId];
+ if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
+ gFlag = true;
+ }
- const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
- + idy * ostrides[1] + idx;
- if(gFlag) {
- out[omId] = scalar<Ty>(offGrid);
- } else {
- const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
- const dim_t imId = idw * istrides[3] + idz * istrides[2] +
- grid_y * istrides[1] + grid_x;
- out[omId] = in[imId];
- }
+ const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+ + idy * ostrides[1] + idx;
+ if(gFlag) {
+ out[omId] = scalar<Ty>(offGrid);
+ } else {
+ const dim_t grid_x = round(x), grid_y = round(y); // nearest grid
+ const dim_t imId = idw * istrides[3] + idz * istrides[2] +
+ grid_y * istrides[1] + grid_x;
+ out[omId] = in[imId];
}
- };
+ }
+};
- template<typename Ty, typename Tp>
- struct approx2_op<Ty, Tp, AF_INTERP_LINEAR>
+template<typename Ty, typename Tp>
+struct approx2_op<Ty, Tp, AF_INTERP_LINEAR>
+{
+ void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
+ const Ty *in, const af::dim4 &idims, const dim_t iElems,
+ const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
+ const af::dim4 &ostrides, const af::dim4 &istrides,
+ const af::dim4 &pstrides, const af::dim4 &qstrides,
+ const float offGrid, const bool pBatch,
+ const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
{
- void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const af::dim4 &pstrides, const af::dim4 &qstrides,
- const float offGrid, const bool pBatch,
- const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw)
- {
- dim_t pmId = idy * pstrides[1] + idx;
- dim_t qmId = idy * qstrides[1] + idx;
- if(pBatch) {
- pmId += idw * pstrides[3] + idz * pstrides[2];
- qmId += idw * qstrides[3] + idz * qstrides[2];
- }
+ dim_t pmId = idy * pstrides[1] + idx;
+ dim_t qmId = idy * qstrides[1] + idx;
+ if(pBatch) {
+ pmId += idw * pstrides[3] + idz * pstrides[2];
+ qmId += idw * qstrides[3] + idz * qstrides[2];
+ }
- bool gFlag = false;
- const Tp x = pos[pmId], y = qos[qmId];
- if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
- gFlag = true;
- }
+ bool gFlag = false;
+ const Tp x = pos[pmId], y = qos[qmId];
+ if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) {
+ gFlag = true;
+ }
- const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid
- const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset
+ const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid
+ const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset
- // Check if pVal and pVal + 1 are both valid indices
- bool condY = (y < idims[1] - 1);
- bool condX = (x < idims[0] - 1);
+ // Check if pVal and pVal + 1 are both valid indices
+ bool condY = (y < idims[1] - 1);
+ bool condX = (x < idims[0] - 1);
- // Compute wieghts used
- Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y);
- Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0;
- Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0;
- Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0;
+ // Compute wieghts used
+ Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y);
+ Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0;
+ Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0;
+ Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0;
- Tp wt = wt00 + wt10 + wt01 + wt11;
- Ty zero = scalar<Ty>(0);
+ Tp wt = wt00 + wt10 + wt01 + wt11;
+ Ty zero = scalar<Ty>(0);
- const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
- + idy * ostrides[1] + idx;
- if(gFlag) {
- out[omId] = scalar<Ty>(offGrid);
- } else {
- dim_t ioff = idw * istrides[3] + idz * istrides[2]
- + grid_y * istrides[1] + grid_x;
+ const dim_t omId = idw * ostrides[3] + idz * ostrides[2]
+ + idy * ostrides[1] + idx;
+ if(gFlag) {
+ out[omId] = scalar<Ty>(offGrid);
+ } else {
+ dim_t ioff = idw * istrides[3] + idz * istrides[2]
+ + grid_y * istrides[1] + grid_x;
- // Compute Weighted Values
- Ty y00 = wt00 * in[ioff];
- Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero;
- Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero;
- Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
+ // Compute Weighted Values
+ Ty y00 = wt00 * in[ioff];
+ Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero;
+ Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero;
+ Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero;
- Ty yo = y00 + y10 + y01 + y11;
+ Ty yo = y00 + y10 + y01 + y11;
- // Write Final Value
- out[omId] = (yo / wt);
- }
+ // Write Final Value
+ out[omId] = (yo / wt);
}
- };
-
- template<typename Ty, typename Tp, af_interp_type method>
- void approx2_(Ty *out, const af::dim4 &odims, const dim_t oElems,
- const Ty *in, const af::dim4 &idims, const dim_t iElems,
- const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const af::dim4 &pstrides, const af::dim4 &qstrides,
- const float offGrid)
- {
- approx2_op<Ty, Tp, method> op;
- bool pBatch = !(pdims[2] == 1 && pdims[3] == 1);
-
- for(dim_t w = 0; w < odims[3]; w++) {
- for(dim_t z = 0; z < odims[2]; z++) {
- for(dim_t y = 0; y < odims[1]; y++) {
- for(dim_t x = 0; x < odims[0]; x++) {
- op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
- ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
- }
+ }
+};
+
+template<typename Ty, typename Tp, af_interp_type method>
+void approx2_(Array<Ty> output, Array<Ty> const input,
+ Array<Tp> const position, Array<Tp> const qosition,
+ float const offGrid)
+{
+ Ty * out = output.get();
+ Ty const * const in = input.get();
+ Tp const * const pos = position.get();
+ Tp const * const qos = qosition.get();
+ dim4 const odims = output.dims();
+ dim4 const idims = input.dims();
+ dim4 const pdims = position.dims();
+ dim4 const qdims = qosition.dims();
+ dim4 const ostrides = output.strides();
+ dim4 const istrides = input.strides();
+ dim4 const pstrides = position.strides();
+ dim4 const qstrides = qosition.strides();
+ dim_t const oElems = output.elements();
+ dim_t const iElems = input.elements();
+
+ approx2_op<Ty, Tp, method> op;
+ bool pBatch = !(pdims[2] == 1 && pdims[3] == 1);
+
+ for(dim_t w = 0; w < odims[3]; w++) {
+ for(dim_t z = 0; z < odims[2]; z++) {
+ for(dim_t y = 0; y < odims[1]; y++) {
+ for(dim_t x = 0; x < odims[0]; x++) {
+ op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims,
+ ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w);
}
}
}
}
+}
- template<typename Ty, typename Tp>
- Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
- const af_interp_type method, const float offGrid)
- {
- in.eval();
- pos0.eval();
- pos1.eval();
-
- af::dim4 odims = in.dims();
- odims[0] = pos0.dims()[0];
- odims[1] = pos0.dims()[1];
-
- // Create output placeholder
- Array<Ty> out = createEmptyArray<Ty>(odims);
-
- switch(method) {
- case AF_INTERP_NEAREST:
- getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_NEAREST>,
- out.get(), out.dims(), out.elements(),
- in.get(), in.dims(), in.elements(),
- pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
- out.strides(), in.strides(), pos0.strides(), pos1.strides(),
- offGrid);
- break;
- case AF_INTERP_LINEAR:
- getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_LINEAR>,
- out.get(), out.dims(), out.elements(),
- in.get(), in.dims(), in.elements(),
- pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
- out.strides(), in.strides(), pos0.strides(), pos1.strides(),
- offGrid);
- break;
- default:
- break;
- }
- return out;
+template<typename Ty, typename Tp>
+Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
+ const af_interp_type method, const float offGrid)
+{
+ in.eval();
+ pos0.eval();
+ pos1.eval();
+
+ af::dim4 odims = in.dims();
+ odims[0] = pos0.dims()[0];
+ odims[1] = pos0.dims()[1];
+
+ Array<Ty> out = createEmptyArray<Ty>(odims);
+
+ switch(method) {
+ case AF_INTERP_NEAREST:
+ getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_NEAREST>,
+ out, in, pos0, pos1, offGrid);
+ break;
+ case AF_INTERP_LINEAR:
+ getQueue().enqueue(approx2_<Ty, Tp, AF_INTERP_LINEAR>,
+ out, in, pos0, pos1, offGrid);
+ break;
+ default:
+ break;
}
+ return out;
+}
#define INSTANTIATE(Ty, Tp) \
template Array<Ty> approx1<Ty, Tp>(const Array<Ty> &in, const Array<Tp> &pos, \
@@ -349,8 +358,9 @@ namespace cpu
const Array<Tp> &pos1, const af_interp_type method, \
const float offGrid); \
- INSTANTIATE(float , float )
- INSTANTIATE(double , double)
- INSTANTIATE(cfloat , float )
- INSTANTIATE(cdouble, double)
+INSTANTIATE(float , float )
+INSTANTIATE(double , double)
+INSTANTIATE(cfloat , float )
+INSTANTIATE(cdouble, double)
+
}
diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp
index 10856f7..ea38ea7 100644
--- a/src/backend/cpu/bilateral.cpp
+++ b/src/backend/cpu/bilateral.cpp
@@ -99,6 +99,7 @@ void bilateral_(Array<outType> out, const Array<inType> in, float s_sigma, float
template<typename inType, typename outType, bool isColor>
Array<outType> bilateral(const Array<inType> &in, const float &s_sigma, const float &c_sigma)
{
+ in.eval();
const dim4 dims = in.dims();
Array<outType> out = createEmptyArray<outType>(dims);
getQueue().enqueue(bilateral_<inType, outType, isColor>, out, in, s_sigma, c_sigma);
diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp
index d0bd3c8..ce11867 100644
--- a/src/backend/cpu/cholesky.cpp
+++ b/src/backend/cpu/cholesky.cpp
@@ -47,6 +47,8 @@ CH_FUNC(potrf , cdouble, z)
template<typename T>
Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
{
+ in.eval();
+
Array<T> out = copyArray<T>(in);
*info = cholesky_inplace(out, is_upper);
@@ -59,6 +61,8 @@ Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
template<typename T>
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
+ in.eval();
+
dim4 iDims = in.dims();
int N = iDims[0];
diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp
index 5240360..eef5e0e 100644
--- a/src/backend/cpu/copy.cpp
+++ b/src/backend/cpu/copy.cpp
@@ -23,144 +23,144 @@
namespace cpu
{
- template<typename T>
- static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim)
- {
- if(dim == 0) {
- if(strides[dim] == 1) {
- //FIXME: Check for errors / exceptions
- memcpy(dst, src, dims[dim] * sizeof(T));
- } else {
- for(dim_t i = 0; i < dims[dim]; i++) {
- dst[i] = src[strides[dim]*i];
- }
- }
+
+template<typename T>
+static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim)
+{
+ if(dim == 0) {
+ if(strides[dim] == 1) {
+ //FIXME: Check for errors / exceptions
+ memcpy(dst, src, dims[dim] * sizeof(T));
} else {
- for(dim_t i = dims[dim]; i > 0; i--) {
- stridedCopy<T>(dst, ostrides, src, dims, strides, dim - 1);
- src += strides[dim];
- dst += ostrides[dim];
+ for(dim_t i = 0; i < dims[dim]; i++) {
+ dst[i] = src[strides[dim]*i];
}
}
- }
-
- // Assigns to single elements
- template<typename T>
- void copyData(T *to, const Array<T> &from)
- {
- from.eval();
- getQueue().sync();
- if(from.isOwner()) {
- // FIXME: Check for errors / exceptions
- memcpy(to, from.get(), from.elements()*sizeof(T));
- } else {
- dim4 ostrides = calcStrides(from.dims());
- stridedCopy<T>(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1);
+ } else {
+ for(dim_t i = dims[dim]; i > 0; i--) {
+ stridedCopy<T>(dst, ostrides, src, dims, strides, dim - 1);
+ src += strides[dim];
+ dst += ostrides[dim];
}
}
+}
- template<typename T>
- Array<T> copyArray(const Array<T> &A)
- {
- Array<T> out = createEmptyArray<T>(A.dims());
- copyData(out.get(), A);
- return out;
+// Assigns to single elements
+template<typename T>
+void copyData(T *to, const Array<T> &from)
+{
+ from.eval();
+ getQueue().sync();
+ if(from.isOwner()) {
+ // FIXME: Check for errors / exceptions
+ memcpy(to, from.get(), from.elements()*sizeof(T));
+ } else {
+ dim4 ostrides = calcStrides(from.dims());
+ stridedCopy<T>(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1);
}
+}
- template<typename inType, typename outType>
- static void copy(Array<outType> dst, const Array<inType> src, outType default_value, double factor)
- {
- dim4 src_dims = src.dims();
- dim4 dst_dims = dst.dims();
- dim4 src_strides = src.strides();
- dim4 dst_strides = dst.strides();
+template<typename T>
+Array<T> copyArray(const Array<T> &A)
+{
+ Array<T> out = createEmptyArray<T>(A.dims());
+ copyData(out.get(), A);
+ return out;
+}
- const inType * src_ptr = src.get();
- outType * dst_ptr = dst.get();
+template<typename inType, typename outType>
+static void copy(Array<outType> dst, const Array<inType> src, outType default_value, double factor)
+{
+ dim4 src_dims = src.dims();
+ dim4 dst_dims = dst.dims();
+ dim4 src_strides = src.strides();
+ dim4 dst_strides = dst.strides();
- dim_t trgt_l = std::min(dst_dims[3], src_dims[3]);
- dim_t trgt_k = std::min(dst_dims[2], src_dims[2]);
- dim_t trgt_j = std::min(dst_dims[1], src_dims[1]);
- dim_t trgt_i = std::min(dst_dims[0], src_dims[0]);
+ const inType * src_ptr = src.get();
+ outType * dst_ptr = dst.get();
- for(dim_t l=0; l<dst_dims[3]; ++l) {
+ dim_t trgt_l = std::min(dst_dims[3], src_dims[3]);
+ dim_t trgt_k = std::min(dst_dims[2], src_dims[2]);
+ dim_t trgt_j = std::min(dst_dims[1], src_dims[1]);
+ dim_t trgt_i = std::min(dst_dims[0], src_dims[0]);
- dim_t src_loff = l*src_strides[3];
- dim_t dst_loff = l*dst_strides[3];
- bool isLvalid = l<trgt_l;
+ for(dim_t l=0; l<dst_dims[3]; ++l) {
- for(dim_t k=0; k<dst_dims[2]; ++k) {
+ dim_t src_loff = l*src_strides[3];
+ dim_t dst_loff = l*dst_strides[3];
+ bool isLvalid = l<trgt_l;
- dim_t src_koff = k*src_strides[2];
- dim_t dst_koff = k*dst_strides[2];
- bool isKvalid = k<trgt_k;
+ for(dim_t k=0; k<dst_dims[2]; ++k) {
- for(dim_t j=0; j<dst_dims[1]; ++j) {
+ dim_t src_koff = k*src_strides[2];
+ dim_t dst_koff = k*dst_strides[2];
+ bool isKvalid = k<trgt_k;
- dim_t src_joff = j*src_strides[1];
- dim_t dst_joff = j*dst_strides[1];
- bool isJvalid = j<trgt_j;
+ for(dim_t j=0; j<dst_dims[1]; ++j) {
- for(dim_t i=0; i<dst_dims[0]; ++i) {
- outType temp = default_value;
- if (isLvalid && isKvalid && isJvalid && i<trgt_i) {
- dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff;
- temp = outType(src_ptr[src_idx])*outType(factor);
- }
- dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff;
- dst_ptr[dst_idx] = temp;
+ dim_t src_joff = j*src_strides[1];
+ dim_t dst_joff = j*dst_strides[1];
+ bool isJvalid = j<trgt_j;
+
+ for(dim_t i=0; i<dst_dims[0]; ++i) {
+ outType temp = default_value;
+ if (isLvalid && isKvalid && isJvalid && i<trgt_i) {
+ dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff;
+ temp = outType(src_ptr[src_idx])*outType(factor);
}
+ dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff;
+ dst_ptr[dst_idx] = temp;
}
}
}
}
+}
- template<typename T>
- void multiply_inplace(Array<T> &in, double val)
- {
- in.eval();
- getQueue().enqueue(copy<T, T>, in, in, 0, val);
- }
-
- template<typename inType, typename outType>
- Array<outType> padArray(Array<inType> const &in, dim4 const &dims,
- outType default_value, double factor)
- {
- Array<outType> ret = createValueArray<outType>(dims, default_value);
- ret.eval();
- in.eval();
- // FIXME:
- getQueue().sync();
- getQueue().enqueue(copy<inType, outType>, ret, in, outType(default_value), factor);
- return ret;
- }
+template<typename T>
+void multiply_inplace(Array<T> &in, double val)
+{
+ in.eval();
+ getQueue().enqueue(copy<T, T>, in, in, 0, val);
+}
- template<typename inType, typename outType>
- void copyArray(Array<outType> &out, Array<inType> const &in)
- {
- out.eval();
- in.eval();
- getQueue().enqueue(copy<inType, outType>, out, in, scalar<outType>(0), 1.0);
- }
+template<typename inType, typename outType>
+Array<outType> padArray(Array<inType> const &in, dim4 const &dims,
+ outType default_value, double factor)
+{
+ Array<outType> ret = createValueArray<outType>(dims, default_value);
+ ret.eval();
+ in.eval();
+ // FIXME:
+ getQueue().sync();
+ getQueue().enqueue(copy<inType, outType>, ret, in, outType(default_value), factor);
+ return ret;
+}
+template<typename inType, typename outType>
+void copyArray(Array<outType> &out, Array<inType> const &in)
+{
+ out.eval();
+ in.eval();
+ getQueue().enqueue(copy<inType, outType>, out, in, scalar<outType>(0), 1.0);
+}
#define INSTANTIATE(T) \
template void copyData<T> (T *data, const Array<T> &from); \
template Array<T> copyArray<T>(const Array<T> &A); \
template void multiply_inplace<T> (Array<T> &in, double norm); \
- INSTANTIATE(float )
- INSTANTIATE(double )
- INSTANTIATE(cfloat )
- INSTANTIATE(cdouble)
- INSTANTIATE(int )
- INSTANTIATE(uint )
- INSTANTIATE(uchar )
- INSTANTIATE(char )
- INSTANTIATE(intl )
- INSTANTIATE(uintl )
- INSTANTIATE(short )
- INSTANTIATE(ushort )
+INSTANTIATE(float )
+INSTANTIATE(double )
+INSTANTIATE(cfloat )
+INSTANTIATE(cdouble)
+INSTANTIATE(int )
+INSTANTIATE(uint )
+INSTANTIATE(uchar )
+INSTANTIATE(char )
+INSTANTIATE(intl )
+INSTANTIATE(uintl )
+INSTANTIATE(short )
+INSTANTIATE(ushort )
#define INSTANTIATE_PAD_ARRAY(SRC_T) \
@@ -189,16 +189,16 @@ namespace cpu
template void copyArray<SRC_T, uchar >(Array<uchar > &dst, Array<SRC_T> const &src); \
template void copyArray<SRC_T, char >(Array<char > &dst, Array<SRC_T> const &src);
- INSTANTIATE_PAD_ARRAY(float )
- INSTANTIATE_PAD_ARRAY(double)
- INSTANTIATE_PAD_ARRAY(int )
- INSTANTIATE_PAD_ARRAY(uint )
- INSTANTIATE_PAD_ARRAY(intl )
- INSTANTIATE_PAD_ARRAY(uintl )
- INSTANTIATE_PAD_ARRAY(uchar )
- INSTANTIATE_PAD_ARRAY(char )
- INSTANTIATE_PAD_ARRAY(ushort)
- INSTANTIATE_PAD_ARRAY(short )
+INSTANTIATE_PAD_ARRAY(float )
+INSTANTIATE_PAD_ARRAY(double)
+INSTANTIATE_PAD_ARRAY(int )
+INSTANTIATE_PAD_ARRAY(uint )
+INSTANTIATE_PAD_ARRAY(intl )
+INSTANTIATE_PAD_ARRAY(uintl )
+INSTANTIATE_PAD_ARRAY(uchar )
+INSTANTIATE_PAD_ARRAY(char )
+INSTANTIATE_PAD_ARRAY(ushort)
+INSTANTIATE_PAD_ARRAY(short )
#define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T) \
template Array<cfloat > padArray<SRC_T, cfloat >(Array<SRC_T> const &src, dim4 const &dims, cfloat default_value, double factor); \
@@ -206,8 +206,8 @@ namespace cpu
template void copyArray<SRC_T, cfloat >(Array<cfloat > &dst, Array<SRC_T> const &src); \
template void copyArray<SRC_T, cdouble >(Array<cdouble > &dst, Array<SRC_T> const &src);
- INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat )
- INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble)
+INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat )
+INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble)
#define SPECILIAZE_UNUSED_COPYARRAY(SRC_T, DST_T) \
template<> void copyArray<SRC_T, DST_T>(Array<DST_T> &out, Array<SRC_T> const &in) \
@@ -215,25 +215,25 @@ namespace cpu
CPU_NOT_SUPPORTED();\
}
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
- SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, char)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
- SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , double)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , float)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , char)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , int)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , short)
+SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, double)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, float)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, char)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, int)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, short)
+SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort)
}
diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp
index 856ed6e..9af7845 100644
--- a/src/backend/cpu/diagonal.cpp
+++ b/src/backend/cpu/diagonal.cpp
@@ -20,87 +20,88 @@
namespace cpu
{
- template<typename T>
- Array<T> diagCreate(const Array<T> &in, const int num)
- {
- in.eval();
-
- int size = in.dims()[0] + std::abs(num);
- int batch = in.dims()[1];
- Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
-
- auto func = [=] (Array<T> out, const Array<T> in) {
- const T *iptr = in.get();
- T *optr = out.get();
-
- for (int k = 0; k < batch; k++) {
- for (int j = 0; j < size; j++) {
- for (int i = 0; i < size; i++) {
- T val = scalar<T>(0);
- if (i == j - num) {
- val = (num > 0) ? iptr[i] : iptr[j];
- }
- optr[i + j * out.strides()[1]] = val;
+
+template<typename T>
+Array<T> diagCreate(const Array<T> &in, const int num)
+{
+ in.eval();
+
+ int size = in.dims()[0] + std::abs(num);
+ int batch = in.dims()[1];
+ Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
+
+ auto func = [=] (Array<T> out, const Array<T> in) {
+ const T *iptr = in.get();
+ T *optr = out.get();
+
+ for (int k = 0; k < batch; k++) {
+ for (int j = 0; j < size; j++) {
+ for (int i = 0; i < size; i++) {
+ T val = scalar<T>(0);
+ if (i == j - num) {
+ val = (num > 0) ? iptr[i] : iptr[j];
}
+ optr[i + j * out.strides()[1]] = val;
}
- optr += out.strides()[2];
- iptr += in.strides()[1];
}
- };
- getQueue().enqueue(func, out, in);
+ optr += out.strides()[2];
+ iptr += in.strides()[1];
+ }
+ };
+ getQueue().enqueue(func, out, in);
- return out;
- }
+ return out;
+}
- template<typename T>
- Array<T> diagExtract(const Array<T> &in, const int num)
- {
- in.eval();
+template<typename T>
+Array<T> diagExtract(const Array<T> &in, const int num)
+{
+ in.eval();
- const dim4 idims = in.dims();
- dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
- Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
+ const dim4 idims = in.dims();
+ dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
+ Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
- auto func = [=] (Array<T> out, const Array<T> in) {
- const dim4 odims = out.dims();
+ auto func = [=] (Array<T> out, const Array<T> in) {
+ const dim4 odims = out.dims();
- const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
+ const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
- for (int l = 0; l < (int)odims[3]; l++) {
+ for (int l = 0; l < (int)odims[3]; l++) {
- for (int k = 0; k < (int)odims[2]; k++) {
- const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
- T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];
+ for (int k = 0; k < (int)odims[2]; k++) {
+ const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
+ T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];
- for (int i = 0; i < (int)odims[0]; i++) {
- T val = scalar<T>(0);
- if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i];
- optr[i] = val;
- }
+ for (int i = 0; i < (int)odims[0]; i++) {
+ T val = scalar<T>(0);
+ if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i];
+ optr[i] = val;
}
}
- };
+ }
+ };
- getQueue().enqueue(func, out, in);
+ getQueue().enqueue(func, out, in);
- return out;
- }
+ return out;
+}
#define INSTANTIATE_DIAGONAL(T) \
template Array<T> diagExtract<T> (const Array<T> &in, const int num); \
template Array<T> diagCreate <T> (const Array<T> &in, const int num);
- INSTANTIATE_DIAGONAL(float)
- INSTANTIATE_DIAGONAL(double)
- INSTANTIATE_DIAGONAL(cfloat)
- INSTANTIATE_DIAGONAL(cdouble)
- INSTANTIATE_DIAGONAL(int)
- INSTANTIATE_DIAGONAL(uint)
- INSTANTIATE_DIAGONAL(intl)
- INSTANTIATE_DIAGONAL(uintl)
- INSTANTIATE_DIAGONAL(char)
- INSTANTIATE_DIAGONAL(uchar)
- INSTANTIATE_DIAGONAL(short)
- INSTANTIATE_DIAGONAL(ushort)
+INSTANTIATE_DIAGONAL(float)
+INSTANTIATE_DIAGONAL(double)
+INSTANTIATE_DIAGONAL(cfloat)
+INSTANTIATE_DIAGONAL(cdouble)
+INSTANTIATE_DIAGONAL(int)
+INSTANTIATE_DIAGONAL(uint)
+INSTANTIATE_DIAGONAL(intl)
+INSTANTIATE_DIAGONAL(uintl)
+INSTANTIATE_DIAGONAL(char)
+INSTANTIATE_DIAGONAL(uchar)
+INSTANTIATE_DIAGONAL(short)
+INSTANTIATE_DIAGONAL(ushort)
}
diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp
index 321dae7..8f9c0f1 100644
--- a/src/backend/cpu/diff.cpp
+++ b/src/backend/cpu/diff.cpp
@@ -16,119 +16,122 @@
namespace cpu
{
- unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
- {
- return (l * strides[3] +
- k * strides[2] +
- j * strides[1] +
- i);
- }
-
- template<typename T>
- Array<T> diff1(const Array<T> &in, const int dim)
- {
- // Bool for dimension
- bool is_dim0 = dim == 0;
- bool is_dim1 = dim == 1;
- bool is_dim2 = dim == 2;
- bool is_dim3 = dim == 3;
-
- // Decrement dimension of select dimension
- af::dim4 dims = in.dims();
- dims[dim]--;
-
- // Create output placeholder
- Array<T> outArray = createEmptyArray<T>(dims);
-
- auto func = [=] (Array<T> outArray, Array<T> in) {
- // Get pointers to raw data
- const T *inPtr = in.get();
- T *outPtr = outArray.get();
-
- // TODO: Improve this
- for(dim_t l = 0; l < dims[3]; l++) {
- for(dim_t k = 0; k < dims[2]; k++) {
- for(dim_t j = 0; j < dims[1]; j++) {
- for(dim_t i = 0; i < dims[0]; i++) {
- // Operation: out[index] = in[index + 1 * dim_size] - in[index]
- int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
- int jdx = getIdx(in.strides(), in.offsets(),
- i + is_dim0, j + is_dim1,
- k + is_dim2, l + is_dim3);
- int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
- outPtr[odx] = inPtr[jdx] - inPtr[idx];
- }
+
+unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
+{
+ return (l * strides[3] +
+ k * strides[2] +
+ j * strides[1] +
+ i);
+}
+
+template<typename T>
+Array<T> diff1(const Array<T> &in, const int dim)
+{
+ in.eval();
+ // Bool for dimension
+ bool is_dim0 = dim == 0;
+ bool is_dim1 = dim == 1;
+ bool is_dim2 = dim == 2;
+ bool is_dim3 = dim == 3;
+
+ // Decrement dimension of select dimension
+ af::dim4 dims = in.dims();
+ dims[dim]--;
+
+ // Create output placeholder
+ Array<T> outArray = createEmptyArray<T>(dims);
+
+ auto func = [=] (Array<T> outArray, Array<T> in) {
+ // Get pointers to raw data
+ const T *inPtr = in.get();
+ T *outPtr = outArray.get();
+
+ // TODO: Improve this
+ for(dim_t l = 0; l < dims[3]; l++) {
+ for(dim_t k = 0; k < dims[2]; k++) {
+ for(dim_t j = 0; j < dims[1]; j++) {
+ for(dim_t i = 0; i < dims[0]; i++) {
+ // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+ int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
+ int jdx = getIdx(in.strides(), in.offsets(),
+ i + is_dim0, j + is_dim1,
+ k + is_dim2, l + is_dim3);
+ int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
+ outPtr[odx] = inPtr[jdx] - inPtr[idx];
}
}
}
- };
- getQueue().enqueue(func, outArray, in);
-
- return outArray;
- }
-
- template<typename T>
- Array<T> diff2(const Array<T> &in, const int dim)
- {
- // Bool for dimension
- bool is_dim0 = dim == 0;
- bool is_dim1 = dim == 1;
- bool is_dim2 = dim == 2;
- bool is_dim3 = dim == 3;
-
- // Decrement dimension of select dimension
- af::dim4 dims = in.dims();
- dims[dim] -= 2;
-
- // Create output placeholder
- Array<T> outArray = createEmptyArray<T>(dims);
-
- auto func = [=] (Array<T> outArray, Array<T> in) {
- // Get pointers to raw data
- const T *inPtr = in.get();
- T *outPtr = outArray.get();
-
- // TODO: Improve this
- for(dim_t l = 0; l < dims[3]; l++) {
- for(dim_t k = 0; k < dims[2]; k++) {
- for(dim_t j = 0; j < dims[1]; j++) {
- for(dim_t i = 0; i < dims[0]; i++) {
- // Operation: out[index] = in[index + 1 * dim_size] - in[index]
- int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
- int jdx = getIdx(in.strides(), in.offsets(),
- i + is_dim0, j + is_dim1,
- k + is_dim2, l + is_dim3);
- int kdx = getIdx(in.strides(), in.offsets(),
- i + 2 * is_dim0, j + 2 * is_dim1,
- k + 2 * is_dim2, l + 2 * is_dim3);
- int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
- outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
- }
+ }
+ };
+ getQueue().enqueue(func, outArray, in);
+
+ return outArray;
+}
+
+template<typename T>
+Array<T> diff2(const Array<T> &in, const int dim)
+{
+ in.eval();
+ // Bool for dimension
+ bool is_dim0 = dim == 0;
+ bool is_dim1 = dim == 1;
+ bool is_dim2 = dim == 2;
+ bool is_dim3 = dim == 3;
+
+ // Decrement dimension of select dimension
+ af::dim4 dims = in.dims();
+ dims[dim] -= 2;
+
+ // Create output placeholder
+ Array<T> outArray = createEmptyArray<T>(dims);
+
+ auto func = [=] (Array<T> outArray, Array<T> in) {
+ // Get pointers to raw data
+ const T *inPtr = in.get();
+ T *outPtr = outArray.get();
+
+ // TODO: Improve this
+ for(dim_t l = 0; l < dims[3]; l++) {
+ for(dim_t k = 0; k < dims[2]; k++) {
+ for(dim_t j = 0; j < dims[1]; j++) {
+ for(dim_t i = 0; i < dims[0]; i++) {
+ // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+ int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
+ int jdx = getIdx(in.strides(), in.offsets(),
+ i + is_dim0, j + is_dim1,
+ k + is_dim2, l + is_dim3);
+ int kdx = getIdx(in.strides(), in.offsets(),
+ i + 2 * is_dim0, j + 2 * is_dim1,
+ k + 2 * is_dim2, l + 2 * is_dim3);
+ int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
+ outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
}
}
}
- };
+ }
+ };
- getQueue().enqueue(func, outArray, in);
+ getQueue().enqueue(func, outArray, in);
- return outArray;
- }
+ return outArray;
+}
#define INSTANTIATE(T) \
template Array<T> diff1<T> (const Array<T> &in, const int dim); \
template Array<T> diff2<T> (const Array<T> &in, const int dim); \
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(ushort)
+INSTANTIATE(short)
- INSTANTIATE(float)
- INSTANTIATE(double)
- INSTANTIATE(cfloat)
- INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
- INSTANTIATE(uchar)
- INSTANTIATE(char)
- INSTANTIATE(ushort)
- INSTANTIATE(short)
}
diff --git a/src/backend/cpu/exampleFunction.cpp b/src/backend/cpu/exampleFunction.cpp
index a9e7bca..d45b8a2 100644
--- a/src/backend/cpu/exampleFunction.cpp
+++ b/src/backend/cpu/exampleFunction.cpp
@@ -24,6 +24,13 @@ namespace cpu
template<typename T>
Array<T> exampleFunction(const Array<T> &in, const af_someenum_t method)
{
+ in.eval(); // All input Arrays should call eval mandatorily
+ // in CPU backend function implementations. Since
+ // the cpu fns are asynchronous launches, any Arrays
+ // that are either views/JIT nodes needs to evaluated
+ // before they are passed onto functions that are
+ // enqueued onto the queues.
+
dim4 outputDims; // this should be '= in.dims();' in most cases
// but would definitely depend on the type of
// algorithm you are implementing.
diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp
index 7262e6d..e522954 100644
--- a/src/backend/cpu/fft.cpp
+++ b/src/backend/cpu/fft.cpp
@@ -95,6 +95,7 @@ void fft_inplace_(Array<T> in)
template<typename T, int rank, bool direction>
void fft_inplace(Array<T> &in)
{
+ in.eval();
getQueue().enqueue(fft_inplace_<T, rank, direction>, in);
}
@@ -165,6 +166,8 @@ void fft_r2c_(Array<Tc> out, const Array<Tr> in)
template<typename Tc, typename Tr, int rank>
Array<Tc> fft_r2c(const Array<Tr> &in)
{
+ in.eval();
+
dim4 odims = in.dims();
odims[0] = odims[0] / 2 + 1;
Array<Tc> out = createEmptyArray<Tc>(odims);
@@ -216,6 +219,8 @@ void fft_c2r_(Array<Tr> out, const Array<Tc> in, const dim4 odims)
template<typename Tr, typename Tc, int rank>
Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
{
+ in.eval();
+
Array<Tr> out = createEmptyArray<Tr>(odims);
getQueue().enqueue(fft_c2r_<Tr, Tc, rank>, out, in, odims);
@@ -243,4 +248,5 @@ Array<Tr> fft_c2r(const Array<Tc> &in, const dim4 &odims)
INSTANTIATE_REAL(float , cfloat )
INSTANTIATE_REAL(double, cdouble)
+
}
diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp
index 504c02a..06c15cf 100644
--- a/src/backend/cpu/gradient.cpp
+++ b/src/backend/cpu/gradient.cpp
@@ -101,4 +101,5 @@ INSTANTIATE(float)
INSTANTIATE(double)
INSTANTIATE(cfloat)
INSTANTIATE(cdouble)
+
}
diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp
index f7236bd..55c4417 100644
--- a/src/backend/cpu/identity.cpp
+++ b/src/backend/cpu/identity.cpp
@@ -18,6 +18,7 @@
namespace cpu
{
+
template<typename T>
Array<T> identity(const dim4& dims)
{
@@ -56,4 +57,5 @@ INSTANTIATE_IDENTITY(char)
INSTANTIATE_IDENTITY(uchar)
INSTANTIATE_IDENTITY(short)
INSTANTIATE_IDENTITY(ushort)
+
}
diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp
index 170b6a1..dcb85fa 100644
--- a/src/backend/cpu/iota.cpp
+++ b/src/backend/cpu/iota.cpp
@@ -21,6 +21,7 @@ using namespace std;
namespace cpu
{
+
///////////////////////////////////////////////////////////////////////////
// Kernel Functions
///////////////////////////////////////////////////////////////////////////
@@ -76,4 +77,5 @@ INSTANTIATE(uintl)
INSTANTIATE(uchar)
INSTANTIATE(short)
INSTANTIATE(ushort)
+
}
diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp
index e562bae..9858cba 100644
--- a/src/backend/cpu/ireduce.cpp
+++ b/src/backend/cpu/ireduce.cpp
@@ -21,178 +21,180 @@ using af::dim4;
namespace cpu
{
- template<typename T> double cabs(const T in) { return (double)in; }
- static double cabs(const char in) { return (double)(in > 0); }
- static double cabs(const cfloat &in) { return (double)abs(in); }
- static double cabs(const cdouble &in) { return (double)abs(in); }
- template<af_op_t op, typename T>
- struct MinMaxOp
+template<typename T> double cabs(const T in) { return (double)in; }
+static double cabs(const char in) { return (double)(in > 0); }
+static double cabs(const cfloat &in) { return (double)abs(in); }
+static double cabs(const cdouble &in) { return (double)abs(in); }
+
+template<af_op_t op, typename T>
+struct MinMaxOp
+{
+ T m_val;
+ uint m_idx;
+ MinMaxOp(T val, uint idx) :
+ m_val(val), m_idx(idx)
{
- T m_val;
- uint m_idx;
- MinMaxOp(T val, uint idx) :
- m_val(val), m_idx(idx)
- {
- }
+ }
- void operator()(T val, uint idx)
- {
- if (cabs(val) < cabs(m_val) ||
- (cabs(val) == cabs(m_val) &&
- idx > m_idx)) {
- m_val = val;
- m_idx = idx;
- }
+ void operator()(T val, uint idx)
+ {
+ if (cabs(val) < cabs(m_val) ||
+ (cabs(val) == cabs(m_val) &&
+ idx > m_idx)) {
+ m_val = val;
+ m_idx = idx;
}
- };
+ }
+};
- template<typename T>
- struct MinMaxOp<af_max_t, T>
+template<typename T>
+struct MinMaxOp<af_max_t, T>
+{
+ T m_val;
+ uint m_idx;
+ MinMaxOp(T val, uint idx) :
+ m_val(val), m_idx(idx)
{
- T m_val;
- uint m_idx;
- MinMaxOp(T val, uint idx) :
- m_val(val), m_idx(idx)
- {
- }
+ }
- void operator()(T val, uint idx)
- {
- if (cabs(val) > cabs(m_val) ||
- (cabs(val) == cabs(m_val) &&
- idx <= m_idx)) {
- m_val = val;
- m_idx = idx;
- }
+ void operator()(T val, uint idx)
+ {
+ if (cabs(val) > cabs(m_val) ||
+ (cabs(val) == cabs(m_val) &&
+ idx <= m_idx)) {
+ m_val = val;
+ m_idx = idx;
}
- };
+ }
+};
- template<af_op_t op, typename T, int D>
- struct ireduce_dim
+template<af_op_t op, typename T, int D>
+struct ireduce_dim
+{
+ void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+ const Array<T> input, const dim_t inOffset, const int dim)
{
- void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
- const Array<T> input, const dim_t inOffset, const int dim)
- {
- const dim4 odims = output.dims();
- const dim4 ostrides = output.strides();
- const dim4 istrides = input.strides();
- const int D1 = D - 1;
- for (dim_t i = 0; i < odims[D1]; i++) {
- ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
- input, inOffset + i * istrides[D1], dim);
- }
+ const dim4 odims = output.dims();
+ const dim4 ostrides = output.strides();
+ const dim4 istrides = input.strides();
+ const int D1 = D - 1;
+ for (dim_t i = 0; i < odims[D1]; i++) {
+ ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
+ input, inOffset + i * istrides[D1], dim);
}
- };
+ }
+};
- template<af_op_t op, typename T>
- struct ireduce_dim<op, T, 0>
+template<af_op_t op, typename T>
+struct ireduce_dim<op, T, 0>
+{
+ void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+ const Array<T> input, const dim_t inOffset, const int dim)
{
- void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
- const Array<T> input, const dim_t inOffset, const int dim)
- {
- const dim4 idims = input.dims();
- const dim4 istrides = input.strides();
-
- T const * const in = input.get();
- T * out = output.get();
- uint * loc = locArray.get();
-
- dim_t stride = istrides[dim];
- MinMaxOp<op, T> Op(in[0], 0);
- for (dim_t i = 0; i < idims[dim]; i++) {
- Op(in[inOffset + i * stride], i);
- }
+ const dim4 idims = input.dims();
+ const dim4 istrides = input.strides();
- *(out+outOffset) = Op.m_val;
- *(loc+outOffset) = Op.m_idx;
- }
- };
+ T const * const in = input.get();
+ T * out = output.get();
+ uint * loc = locArray.get();
- template<af_op_t op, typename T>
- using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
- const Array<T>, const dim_t, const int)>;
+ dim_t stride = istrides[dim];
+ MinMaxOp<op, T> Op(in[0], 0);
+ for (dim_t i = 0; i < idims[dim]; i++) {
+ Op(in[inOffset + i * stride], i);
+ }
- template<af_op_t op, typename T>
- void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
- {
- out.eval();
- loc.eval();
- in.eval();
-
- dim4 odims = in.dims();
- odims[dim] = 1;
- static const ireduce_dim_func<op, T> ireduce_funcs[] = { ireduce_dim<op, T, 1>()
- , ireduce_dim<op, T, 2>()
- , ireduce_dim<op, T, 3>()
- , ireduce_dim<op, T, 4>()};
-
- getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim);
+ *(out+outOffset) = Op.m_val;
+ *(loc+outOffset) = Op.m_idx;
}
+};
- template<af_op_t op, typename T>
- T ireduce_all(unsigned *loc, const Array<T> &in)
- {
- in.eval();
- getQueue().sync();
+template<af_op_t op, typename T>
+using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
+ const Array<T>, const dim_t, const int)>;
- af::dim4 dims = in.dims();
- af::dim4 strides = in.strides();
- const T *inPtr = in.get();
+template<af_op_t op, typename T>
+void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
+{
+ out.eval();
+ loc.eval();
+ in.eval();
+
+ dim4 odims = in.dims();
+ odims[dim] = 1;
+ static const ireduce_dim_func<op, T> ireduce_funcs[] = { ireduce_dim<op, T, 1>()
+ , ireduce_dim<op, T, 2>()
+ , ireduce_dim<op, T, 3>()
+ , ireduce_dim<op, T, 4>()};
+
+ getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim);
+}
- MinMaxOp<op, T> Op(inPtr[0], 0);
+template<af_op_t op, typename T>
+T ireduce_all(unsigned *loc, const Array<T> &in)
+{
+ in.eval();
+ getQueue().sync();
+
+ af::dim4 dims = in.dims();
+ af::dim4 strides = in.strides();
+ const T *inPtr = in.get();
- for(dim_t l = 0; l < dims[3]; l++) {
- dim_t off3 = l * strides[3];
+ MinMaxOp<op, T> Op(inPtr[0], 0);
- for(dim_t k = 0; k < dims[2]; k++) {
- dim_t off2 = k * strides[2];
+ for(dim_t l = 0; l < dims[3]; l++) {
+ dim_t off3 = l * strides[3];
- for(dim_t j = 0; j < dims[1]; j++) {
- dim_t off1 = j * strides[1];
+ for(dim_t k = 0; k < dims[2]; k++) {
+ dim_t off2 = k * strides[2];
- for(dim_t i = 0; i < dims[0]; i++) {
- dim_t idx = i + off1 + off2 + off3;
- Op(inPtr[idx], idx);
- }
+ for(dim_t j = 0; j < dims[1]; j++) {
+ dim_t off1 = j * strides[1];
+
+ for(dim_t i = 0; i < dims[0]; i++) {
+ dim_t idx = i + off1 + off2 + off3;
+ Op(inPtr[idx], idx);
}
}
}
-
- *loc = Op.m_idx;
- return Op.m_val;
}
+ *loc = Op.m_idx;
+ return Op.m_val;
+}
+
#define INSTANTIATE(ROp, T) \
template void ireduce<ROp, T>(Array<T> &out, Array<uint> &loc, \
const Array<T> &in, const int dim); \
template T ireduce_all<ROp, T>(unsigned *loc, const Array<T> &in); \
- //min
- INSTANTIATE(af_min_t, float )
- INSTANTIATE(af_min_t, double )
- INSTANTIATE(af_min_t, cfloat )
- INSTANTIATE(af_min_t, cdouble)
- INSTANTIATE(af_min_t, int )
- INSTANTIATE(af_min_t, uint )
- INSTANTIATE(af_min_t, intl )
- INSTANTIATE(af_min_t, uintl )
- INSTANTIATE(af_min_t, char )
- INSTANTIATE(af_min_t, uchar )
- INSTANTIATE(af_min_t, short )
- INSTANTIATE(af_min_t, ushort )
-
- //max
- INSTANTIATE(af_max_t, float )
- INSTANTIATE(af_max_t, double )
- INSTANTIATE(af_max_t, cfloat )
- INSTANTIATE(af_max_t, cdouble)
- INSTANTIATE(af_max_t, int )
- INSTANTIATE(af_max_t, uint )
- INSTANTIATE(af_max_t, intl )
- INSTANTIATE(af_max_t, uintl )
- INSTANTIATE(af_max_t, char )
- INSTANTIATE(af_max_t, uchar )
- INSTANTIATE(af_max_t, short )
- INSTANTIATE(af_max_t, ushort )
+//min
+INSTANTIATE(af_min_t, float )
+INSTANTIATE(af_min_t, double )
+INSTANTIATE(af_min_t, cfloat )
+INSTANTIATE(af_min_t, cdouble)
+INSTANTIATE(af_min_t, int )
+INSTANTIATE(af_min_t, uint )
+INSTANTIATE(af_min_t, intl )
+INSTANTIATE(af_min_t, uintl )
+INSTANTIATE(af_min_t, char )
+INSTANTIATE(af_min_t, uchar )
+INSTANTIATE(af_min_t, short )
+INSTANTIATE(af_min_t, ushort )
+
+//max
+INSTANTIATE(af_max_t, float )
+INSTANTIATE(af_max_t, double )
+INSTANTIATE(af_max_t, cfloat )
+INSTANTIATE(af_max_t, cdouble)
+INSTANTIATE(af_max_t, int )
+INSTANTIATE(af_max_t, uint )
+INSTANTIATE(af_max_t, intl )
+INSTANTIATE(af_max_t, uintl )
+INSTANTIATE(af_max_t, char )
+INSTANTIATE(af_max_t, uchar )
+INSTANTIATE(af_max_t, short )
+INSTANTIATE(af_max_t, ushort )
+
}
diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp
index ff0be43..9a04613 100644
--- a/src/backend/cpu/lu.cpp
+++ b/src/backend/cpu/lu.cpp
@@ -104,6 +104,8 @@ void convertPivot(Array<int> p, Array<int> pivot)
template<typename T>
void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
{
+ in.eval();
+
dim4 iDims = in.dims();
int M = iDims[0];
int N = iDims[1];
@@ -123,6 +125,8 @@ void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
template<typename T>
Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
{
+ in.eval();
+
dim4 iDims = in.dims();
Array<int> pivot = createEmptyArray<int>(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1));
@@ -166,6 +170,7 @@ Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
namespace cpu
{
+
#define INSTANTIATE_LU(T) \
template Array<int> lu_inplace<T>(Array<T> &in, const bool convert_pivot); \
template void lu<T>(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in);
@@ -174,4 +179,5 @@ INSTANTIATE_LU(float)
INSTANTIATE_LU(cfloat)
INSTANTIATE_LU(double)
INSTANTIATE_LU(cdouble)
+
}
diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp
index 02a4888..d4ce95a 100644
--- a/src/backend/cpu/match_template.cpp
+++ b/src/backend/cpu/match_template.cpp
@@ -24,6 +24,9 @@ namespace cpu
template<typename inType, typename outType, af_match_type mType>
Array<outType> match_template(const Array<inType> &sImg, const Array<inType> &tImg)
{
+ sImg.eval();
+ tImg.eval();
+
Array<outType> out = createEmptyArray<outType>(sImg.dims());
auto func = [=](Array<outType> out, const Array<inType> sImg, const Array<inType> tImg) {
diff --git a/src/backend/cpu/math.cpp b/src/backend/cpu/math.cpp
index 5a6bcbc..e00fd78 100644
--- a/src/backend/cpu/math.cpp
+++ b/src/backend/cpu/math.cpp
@@ -11,39 +11,41 @@
namespace cpu
{
- uint abs(uint val) { return val; }
- uchar abs(uchar val) { return val; }
- uintl abs(uintl val) { return val; }
-
- cfloat scalar(float val)
- {
- cfloat cval = {(float)val, 0};
- return cval;
- }
-
- cdouble scalar(double val)
- {
- cdouble cval = {val, 0};
- return cval;
- }
-
- cfloat min(cfloat lhs, cfloat rhs)
- {
- return abs(lhs) < abs(rhs) ? lhs : rhs;
- }
-
- cdouble min(cdouble lhs, cdouble rhs)
- {
- return abs(lhs) < abs(rhs) ? lhs : rhs;
- }
-
- cfloat max(cfloat lhs, cfloat rhs)
- {
- return abs(lhs) > abs(rhs) ? lhs : rhs;
- }
-
- cdouble max(cdouble lhs, cdouble rhs)
- {
- return abs(lhs) > abs(rhs) ? lhs : rhs;
- }
+
+uint abs(uint val) { return val; }
+uchar abs(uchar val) { return val; }
+uintl abs(uintl val) { return val; }
+
+cfloat scalar(float val)
+{
+ cfloat cval = {(float)val, 0};
+ return cval;
+}
+
+cdouble scalar(double val)
+{
+ cdouble cval = {val, 0};
+ return cval;
+}
+
+cfloat min(cfloat lhs, cfloat rhs)
+{
+ return abs(lhs) < abs(rhs) ? lhs : rhs;
+}
+
+cdouble min(cdouble lhs, cdouble rhs)
+{
+ return abs(lhs) < abs(rhs) ? lhs : rhs;
+}
+
+cfloat max(cfloat lhs, cfloat rhs)
+{
+ return abs(lhs) > abs(rhs) ? lhs : rhs;
+}
+
+cdouble max(cdouble lhs, cdouble rhs)
+{
+ return abs(lhs) > abs(rhs) ? lhs : rhs;
+}
+
}
diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp
index 3f99d15..62b80e0 100644
--- a/src/backend/cpu/meanshift.cpp
+++ b/src/backend/cpu/meanshift.cpp
@@ -33,6 +33,8 @@ inline dim_t clamp(dim_t a, dim_t mn, dim_t mx)
template<typename T, bool is_color>
Array<T> meanshift(const Array<T> &in, const float &s_sigma, const float &c_sigma, const unsigned iter)
{
+ in.eval();
+
Array<T> out = createEmptyArray<T>(in.dims());
auto func = [=] (Array<T> out, const Array<T> in, const float s_sigma,
diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp
index ce921fc..4e74a55 100644
--- a/src/backend/cpu/medfilt.cpp
+++ b/src/backend/cpu/medfilt.cpp
@@ -25,6 +25,8 @@ namespace cpu
template<typename T, af_border_type pad>
Array<T> medfilt(const Array<T> &in, dim_t w_len, dim_t w_wid)
{
+ in.eval();
+
Array<T> out = createEmptyArray<T>(in.dims());
auto func = [=] (Array<T> out, const Array<T> in,
diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp
index 73120b9..e11f994 100644
--- a/src/backend/cpu/memory.cpp
+++ b/src/backend/cpu/memory.cpp
@@ -20,211 +20,211 @@
namespace cpu
{
- static size_t memory_resolution = 1024; //1KB
+static size_t memory_resolution = 1024; //1KB
- void setMemStepSize(size_t step_bytes)
- {
- memory_resolution = step_bytes;
- }
+void setMemStepSize(size_t step_bytes)
+{
+ memory_resolution = step_bytes;
+}
+
+size_t getMemStepSize(void)
+{
+ return memory_resolution;
+}
- size_t getMemStepSize(void)
+class Manager
+{
+ public:
+ static bool initialized;
+ Manager()
{
- return memory_resolution;
+ initialized = true;
}
- class Manager
+ ~Manager()
{
- public:
- static bool initialized;
- Manager()
- {
- initialized = true;
- }
-
- ~Manager()
- {
- garbageCollect();
- }
- };
+ garbageCollect();
+ }
+};
- bool Manager::initialized = false;
+bool Manager::initialized = false;
- static void managerInit()
- {
- if(Manager::initialized == false)
- static Manager pm = Manager();
- }
+static void managerInit()
+{
+ if(Manager::initialized == false)
+ static Manager pm = Manager();
+}
- typedef struct
- {
- bool is_free;
- bool is_unlinked;
- size_t bytes;
- } mem_info;
-
- static size_t used_bytes = 0;
- static size_t used_buffers = 0;
- static size_t total_bytes = 0;
- typedef std::map<void *, mem_info> mem_t;
- typedef mem_t::iterator mem_iter;
-
- mem_t memory_map;
- std::mutex memory_map_mutex;
-
- template<typename T>
- void freeWrapper(T *ptr)
- {
- free((void *)ptr);
- }
+typedef struct
+{
+ bool is_free;
+ bool is_unlinked;
+ size_t bytes;
+} mem_info;
+
+static size_t used_bytes = 0;
+static size_t used_buffers = 0;
+static size_t total_bytes = 0;
+typedef std::map<void *, mem_info> mem_t;
+typedef mem_t::iterator mem_iter;
+
+mem_t memory_map;
+std::mutex memory_map_mutex;
+
+template<typename T>
+void freeWrapper(T *ptr)
+{
+ free((void *)ptr);
+}
- void garbageCollect()
- {
- for(mem_iter iter = memory_map.begin();
- iter != memory_map.end(); ++iter) {
+void garbageCollect()
+{
+ for(mem_iter iter = memory_map.begin();
+ iter != memory_map.end(); ++iter) {
- if ((iter->second).is_free) {
+ if ((iter->second).is_free) {
- if (!(iter->second).is_unlinked) {
- freeWrapper(iter->first);
- total_bytes -= iter->second.bytes;
- }
+ if (!(iter->second).is_unlinked) {
+ freeWrapper(iter->first);
+ total_bytes -= iter->second.bytes;
}
}
+ }
- mem_iter memory_curr = memory_map.begin();
- mem_iter memory_end = memory_map.end();
+ mem_iter memory_curr = memory_map.begin();
+ mem_iter memory_end = memory_map.end();
- while(memory_curr != memory_end) {
- if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) {
- memory_map.erase(memory_curr++);
- } else {
- ++memory_curr;
- }
+ while(memory_curr != memory_end) {
+ if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) {
+ memory_map.erase(memory_curr++);
+ } else {
+ ++memory_curr;
}
}
+}
- template<typename T>
- T* memAlloc(const size_t &elements)
- {
- managerInit();
+template<typename T>
+T* memAlloc(const size_t &elements)
+{
+ managerInit();
- T* ptr = NULL;
- size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;
+ T* ptr = NULL;
+ size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;
- if (elements > 0) {
- std::lock_guard<std::mutex> lock(memory_map_mutex);
+ if (elements > 0) {
+ std::lock_guard<std::mutex> lock(memory_map_mutex);
- // FIXME: Add better checks for garbage collection
- // Perhaps look at total memory available as a metric
- if (memory_map.size() > MAX_BUFFERS ||
- used_bytes >= MAX_BYTES) {
+ // FIXME: Add better checks for garbage collection
+ // Perhaps look at total memory available as a metric
+ if (memory_map.size() > MAX_BUFFERS ||
+ used_bytes >= MAX_BYTES) {
- garbageCollect();
- }
+ garbageCollect();
+ }
- for(mem_iter iter = memory_map.begin();
- iter != memory_map.end(); ++iter) {
+ for(mem_iter iter = memory_map.begin();
+ iter != memory_map.end(); ++iter) {
- mem_info info = iter->second;
+ mem_info info = iter->second;
- if ( info.is_free &&
- !info.is_unlinked &&
- info.bytes == alloc_bytes) {
+ if ( info.is_free &&
+ !info.is_unlinked &&
+ info.bytes == alloc_bytes) {
- iter->second.is_free = false;
- used_bytes += alloc_bytes;
- used_buffers++;
- return (T *)iter->first;
- }
+ iter->second.is_free = false;
+ used_bytes += alloc_bytes;
+ used_buffers++;
+ return (T *)iter->first;
}
+ }
- // Perform garbage collection if memory can not be allocated
- ptr = (T *)malloc(alloc_bytes);
+ // Perform garbage collection if memory can not be allocated
+ ptr = (T *)malloc(alloc_bytes);
- if (ptr == NULL) {
- AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM);
- }
+ if (ptr == NULL) {
+ AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM);
+ }
- mem_info info = {false, false, alloc_bytes};
- memory_map[ptr] = info;
+ mem_info info = {false, false, alloc_bytes};
+ memory_map[ptr] = info;
- used_bytes += alloc_bytes;
- used_buffers++;
- total_bytes += alloc_bytes;
- }
- return ptr;
+ used_bytes += alloc_bytes;
+ used_buffers++;
+ total_bytes += alloc_bytes;
}
+ return ptr;
+}
- template<typename T>
- void memFree(T *ptr)
- {
- std::lock_guard<std::mutex> lock(memory_map_mutex);
+template<typename T>
+void memFree(T *ptr)
+{
+ std::lock_guard<std::mutex> lock(memory_map_mutex);
- mem_iter iter = memory_map.find((void *)ptr);
+ mem_iter iter = memory_map.find((void *)ptr);
- if (iter != memory_map.end()) {
+ if (iter != memory_map.end()) {
- iter->second.is_free = true;
- if ((iter->second).is_unlinked) return;
+ iter->second.is_free = true;
+ if ((iter->second).is_unlinked) return;
- used_bytes -= iter->second.bytes;
- used_buffers--;
+ used_bytes -= iter->second.bytes;
+ used_buffers--;
- } else {
- freeWrapper(ptr); // Free it because we are not sure what the size is
- }
+ } else {
+ freeWrapper(ptr); // Free it because we are not sure what the size is
}
+}
- template<typename T>
- void memPop(const T *ptr)
- {
- std::lock_guard<std::mutex> lock(memory_map_mutex);
+template<typename T>
+void memPop(const T *ptr)
+{
+ std::lock_guard<std::mutex> lock(memory_map_mutex);
- mem_iter iter = memory_map.find((void *)ptr);
+ mem_iter iter = memory_map.find((void *)ptr);
- if (iter != memory_map.end()) {
- iter->second.is_unlinked = true;
- } else {
- mem_info info = { false,
- true,
- 100 }; //This number is not relevant
+ if (iter != memory_map.end()) {
+ iter->second.is_unlinked = true;
+ } else {
+ mem_info info = { false,
+ true,
+ 100 }; //This number is not relevant
- memory_map[(void *)ptr] = info;
- }
+ memory_map[(void *)ptr] = info;
}
+}
- template<typename T>
- void memPush(const T *ptr)
- {
- std::lock_guard<std::mutex> lock(memory_map_mutex);
- mem_iter iter = memory_map.find((void *)ptr);
- if (iter != memory_map.end()) {
- iter->second.is_unlinked = false;
- }
+template<typename T>
+void memPush(const T *ptr)
+{
+ std::lock_guard<std::mutex> lock(memory_map_mutex);
+ mem_iter iter = memory_map.find((void *)ptr);
+ if (iter != memory_map.end()) {
+ iter->second.is_unlinked = false;
}
+}
- void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
- size_t *lock_bytes, size_t *lock_buffers)
- {
- getQueue().sync();
- if (alloc_bytes ) *alloc_bytes = total_bytes;
- if (alloc_buffers ) *alloc_buffers = memory_map.size();
- if (lock_bytes ) *lock_bytes = used_bytes;
- if (lock_buffers ) *lock_buffers = used_buffers;
- }
+void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers,
+ size_t *lock_bytes, size_t *lock_buffers)
+{
+ getQueue().sync();
+ if (alloc_bytes ) *alloc_bytes = total_bytes;
+ if (alloc_buffers ) *alloc_buffers = memory_map.size();
+ if (lock_bytes ) *lock_bytes = used_bytes;
+ if (lock_buffers ) *lock_buffers = used_buffers;
+}
- template<typename T>
- T* pinnedAlloc(const size_t &elements)
- {
- return memAlloc<T>(elements);
- }
+template<typename T>
+T* pinnedAlloc(const size_t &elements)
+{
+ return memAlloc<T>(elements);
+}
- template<typename T>
- void pinnedFree(T* ptr)
- {
- memFree<T>(ptr);
- }
+template<typename T>
+void pinnedFree(T* ptr)
+{
+ memFree<T>(ptr);
+}
#define INSTANTIATE(T) \
template T* memAlloc(const size_t &elements); \
@@ -234,16 +234,17 @@ namespace cpu
template T* pinnedAlloc(const size_t &elements); \
template void pinnedFree(T* ptr); \
- INSTANTIATE(float)
- INSTANTIATE(cfloat)
- INSTANTIATE(double)
- INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(char)
- INSTANTIATE(uchar)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
- INSTANTIATE(ushort)
- INSTANTIATE(short )
+INSTANTIATE(float)
+INSTANTIATE(cfloat)
+INSTANTIATE(double)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(ushort)
+INSTANTIATE(short )
+
}
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
index 97f0e0a..b6f50c2 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -163,8 +163,6 @@ void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
idx = createEmptyArray<uint>(outDims);
dist = createEmptyArray<To >(outDims);
- idx.eval();
- dist.eval();
switch(dist_type) {
case AF_SAD:
diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp
index b5f1806..78631fc 100644
--- a/src/backend/cpu/qr.cpp
+++ b/src/backend/cpu/qr.cpp
@@ -59,6 +59,8 @@ GQR_FUNC(gqr , cdouble, zungqr)
template<typename T>
void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
{
+ in.eval();
+
dim4 iDims = in.dims();
int M = iDims[0];
int N = iDims[1];
@@ -83,6 +85,8 @@ void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in)
template<typename T>
Array<T> qr_inplace(Array<T> &in)
{
+ in.eval();
+
dim4 iDims = in.dims();
int M = iDims[0];
int N = iDims[1];
@@ -121,6 +125,7 @@ Array<T> qr_inplace(Array<T> &in)
namespace cpu
{
+
#define INSTANTIATE_QR(T) \
template Array<T> qr_inplace<T>(Array<T> &in); \
template void qr<T>(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in);
@@ -129,4 +134,5 @@ INSTANTIATE_QR(float)
INSTANTIATE_QR(cfloat)
INSTANTIATE_QR(double)
INSTANTIATE_QR(cdouble)
+
}
diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp
index 1fa46b2..7837db5 100644
--- a/src/backend/cpu/range.cpp
+++ b/src/backend/cpu/range.cpp
@@ -19,6 +19,7 @@
namespace cpu
{
+
///////////////////////////////////////////////////////////////////////////
// Kernel Functions
///////////////////////////////////////////////////////////////////////////
@@ -90,4 +91,5 @@ INSTANTIATE(uintl)
INSTANTIATE(uchar)
INSTANTIATE(ushort)
INSTANTIATE(short)
+
}
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index afe5620..1ad7dad 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -16,70 +16,70 @@
namespace cpu
{
- template<typename T>
- void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
- {
- T* outPtr = out.get();
- const T* inPtr = in.get();
- const af::dim4 ist = in.strides();
- const af::dim4 ost = out.strides();
+template<typename T>
+void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
+{
+ T* outPtr = out.get();
+ const T* inPtr = in.get();
+
+ const af::dim4 ist = in.strides();
+ const af::dim4 ost = out.strides();
- dim_t ids[4] = {0};
- for(dim_t ow = 0; ow < oDims[3]; ow++) {
- const dim_t oW = ow * ost[3];
- ids[rdims[3]] = ow;
- for(dim_t oz = 0; oz < oDims[2]; oz++) {
- const dim_t oZW = oW + oz * ost[2];
- ids[rdims[2]] = oz;
- for(dim_t oy = 0; oy < oDims[1]; oy++) {
- const dim_t oYZW = oZW + oy * ost[1];
- ids[rdims[1]] = oy;
- for(dim_t ox = 0; ox < oDims[0]; ox++) {
- const dim_t oIdx = oYZW + ox;
+ dim_t ids[4] = {0};
+ for(dim_t ow = 0; ow < oDims[3]; ow++) {
+ const dim_t oW = ow * ost[3];
+ ids[rdims[3]] = ow;
+ for(dim_t oz = 0; oz < oDims[2]; oz++) {
+ const dim_t oZW = oW + oz * ost[2];
+ ids[rdims[2]] = oz;
+ for(dim_t oy = 0; oy < oDims[1]; oy++) {
+ const dim_t oYZW = oZW + oy * ost[1];
+ ids[rdims[1]] = oy;
+ for(dim_t ox = 0; ox < oDims[0]; ox++) {
+ const dim_t oIdx = oYZW + ox;
- ids[rdims[0]] = ox;
- const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
- ids[1] * ist[1] + ids[0];
+ ids[rdims[0]] = ox;
+ const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
+ ids[1] * ist[1] + ids[0];
- outPtr[oIdx] = inPtr[iIdx];
- }
+ outPtr[oIdx] = inPtr[iIdx];
}
}
}
}
+}
- template<typename T>
- Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
- {
- in.eval();
+template<typename T>
+Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
+{
+ in.eval();
- const af::dim4 iDims = in.dims();
- af::dim4 oDims(0);
- for(int i = 0; i < 4; i++)
- oDims[i] = iDims[rdims[i]];
+ const af::dim4 iDims = in.dims();
+ af::dim4 oDims(0);
+ for(int i = 0; i < 4; i++)
+ oDims[i] = iDims[rdims[i]];
- Array<T> out = createEmptyArray<T>(oDims);
- getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
- return out;
- }
+ Array<T> out = createEmptyArray<T>(oDims);
+ getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
+ return out;
+}
#define INSTANTIATE(T) \
template Array<T> reorder<T>(const Array<T> &in, const af::dim4 &rdims); \
- INSTANTIATE(float)
- INSTANTIATE(double)
- INSTANTIATE(cfloat)
- INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(uchar)
- INSTANTIATE(char)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
-
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
}
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp
index 160ed46..8fb2edc 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/resize.cpp
@@ -19,6 +19,7 @@
namespace cpu
{
+
/**
* noop function for round to avoid compilation
* issues due to lack of this function in C90 based
@@ -215,4 +216,5 @@ INSTANTIATE(uchar)
INSTANTIATE(char)
INSTANTIATE(short)
INSTANTIATE(ushort)
+
}
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp
index 01ec962..5687d69 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/rotate.cpp
@@ -18,106 +18,110 @@
namespace cpu
{
- template<typename T, af_interp_type method>
- void rotate_(Array<T> output, const Array<T> input, const float theta)
+
+template<typename T, af_interp_type method>
+void rotate_(Array<T> output, const Array<T> input, const float theta)
+{
+ const af::dim4 odims = output.dims();
+ const af::dim4 idims = input.dims();
+ const af::dim4 ostrides = output.strides();
+ const af::dim4 istrides = input.strides();
+
+ const T* in = input.get();
+ T* out = output.get();
+ dim_t nimages = idims[2];
+
+ void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
+ const af::dim4 &, const af::dim4 &,
+ const dim_t, const dim_t, const dim_t, const dim_t);
+
+ const float c = cos(-theta), s = sin(-theta);
+ float tx, ty;
{
- const af::dim4 odims = output.dims();
- const af::dim4 idims = input.dims();
- const af::dim4 ostrides = output.strides();
- const af::dim4 istrides = input.strides();
-
- const T* in = input.get();
- T* out = output.get();
- dim_t nimages = idims[2];
-
- void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
- const af::dim4 &, const af::dim4 &,
- const dim_t, const dim_t, const dim_t, const dim_t);
-
- const float c = cos(-theta), s = sin(-theta);
- float tx, ty;
- {
- const float nx = 0.5 * (idims[0] - 1);
- const float ny = 0.5 * (idims[1] - 1);
- const float mx = 0.5 * (odims[0] - 1);
- const float my = 0.5 * (odims[1] - 1);
- const float sx = (mx * c + my *-s);
- const float sy = (mx * s + my * c);
- tx = -(sx - nx);
- ty = -(sy - ny);
- }
+ const float nx = 0.5 * (idims[0] - 1);
+ const float ny = 0.5 * (idims[1] - 1);
+ const float mx = 0.5 * (odims[0] - 1);
+ const float my = 0.5 * (odims[1] - 1);
+ const float sx = (mx * c + my *-s);
+ const float sy = (mx * s + my * c);
+ tx = -(sx - nx);
+ ty = -(sy - ny);
+ }
- const float tmat[6] = {std::round( c * 1000) / 1000.0f,
- std::round(-s * 1000) / 1000.0f,
- std::round(tx * 1000) / 1000.0f,
- std::round( s * 1000) / 1000.0f,
- std::round( c * 1000) / 1000.0f,
- std::round(ty * 1000) / 1000.0f,
- };
-
- switch(method) {
- case AF_INTERP_NEAREST:
- t_fn = &transform_n;
- break;
- case AF_INTERP_BILINEAR:
- t_fn = &transform_b;
- break;
- case AF_INTERP_LOWER:
- t_fn = &transform_l;
- break;
- default:
- AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
- break;
- }
+ const float tmat[6] = {std::round( c * 1000) / 1000.0f,
+ std::round(-s * 1000) / 1000.0f,
+ std::round(tx * 1000) / 1000.0f,
+ std::round( s * 1000) / 1000.0f,
+ std::round( c * 1000) / 1000.0f,
+ std::round(ty * 1000) / 1000.0f,
+ };
+
+ switch(method) {
+ case AF_INTERP_NEAREST:
+ t_fn = &transform_n;
+ break;
+ case AF_INTERP_BILINEAR:
+ t_fn = &transform_b;
+ break;
+ case AF_INTERP_LOWER:
+ t_fn = &transform_l;
+ break;
+ default:
+ AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+ break;
+ }
- // Do transform for image
- for(int yy = 0; yy < (int)odims[1]; yy++) {
- for(int xx = 0; xx < (int)odims[0]; xx++) {
- t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
- }
+ // Do transform for image
+ for(int yy = 0; yy < (int)odims[1]; yy++) {
+ for(int xx = 0; xx < (int)odims[0]; xx++) {
+ t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
}
}
+}
- template<typename T>
- Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
- const af_interp_type method)
- {
- Array<T> out = createEmptyArray<T>(odims);
-
- switch(method) {
- case AF_INTERP_NEAREST:
- getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
- break;
- case AF_INTERP_BILINEAR:
- getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
- break;
- case AF_INTERP_LOWER:
- getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
- break;
- default:
- AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
- break;
- }
+template<typename T>
+Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
+ const af_interp_type method)
+{
+ in.eval();
- return out;
+ Array<T> out = createEmptyArray<T>(odims);
+
+ switch(method) {
+ case AF_INTERP_NEAREST:
+ getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
+ break;
+ case AF_INTERP_BILINEAR:
+ getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
+ break;
+ case AF_INTERP_LOWER:
+ getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
+ break;
+ default:
+ AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+ break;
}
+ return out;
+}
+
#define INSTANTIATE(T) \
template Array<T> rotate(const Array<T> &in, const float theta, \
const af::dim4 &odims, const af_interp_type method);
- INSTANTIATE(float)
- INSTANTIATE(double)
- INSTANTIATE(cfloat)
- INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
- INSTANTIATE(uchar)
- INSTANTIATE(char)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(uchar)
+INSTANTIATE(char)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+
}
diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp
index 67aa586..d6321bb 100644
--- a/src/backend/cpu/set.cpp
+++ b/src/backend/cpu/set.cpp
@@ -23,116 +23,118 @@
namespace cpu
{
- using namespace std;
- using af::dim4;
-
- template<typename T>
- Array<T> setUnique(const Array<T> &in,
- const bool is_sorted)
- {
- in.eval();
-
- Array<T> out = createEmptyArray<T>(af::dim4());
- if (is_sorted) out = copyArray<T>(in);
- else out = sort<T, 1>(in, 0);
-
- // Need to sync old jobs since we need to
- // operator on pointers directly in std::unique
- getQueue().sync();
-
- T *ptr = out.get();
- T *last = std::unique(ptr, ptr + in.elements());
- dim_t dist = (dim_t)std::distance(ptr, last);
-
- dim4 dims(dist, 1, 1, 1);
- out.resetDims(dims);
- return out;
- }
- template<typename T>
- Array<T> setUnion(const Array<T> &first,
- const Array<T> &second,
- const bool is_unique)
- {
- first.eval();
- second.eval();
- getQueue().sync();
+using namespace std;
+using af::dim4;
- Array<T> uFirst = first;
- Array<T> uSecond = second;
+template<typename T>
+Array<T> setUnique(const Array<T> &in,
+ const bool is_sorted)
+{
+ in.eval();
- if (!is_unique) {
- // FIXME: Perhaps copy + unique would do ?
- uFirst = setUnique(first, false);
- uSecond = setUnique(second, false);
- }
+ Array<T> out = createEmptyArray<T>(af::dim4());
+ if (is_sorted) out = copyArray<T>(in);
+ else out = sort<T, 1>(in, 0);
- dim_t first_elements = uFirst.elements();
- dim_t second_elements = uSecond.elements();
- dim_t elements = first_elements + second_elements;
+ // Need to sync old jobs since we need to
+ // operator on pointers directly in std::unique
+ getQueue().sync();
- Array<T> out = createEmptyArray<T>(af::dim4(elements));
+ T *ptr = out.get();
+ T *last = std::unique(ptr, ptr + in.elements());
+ dim_t dist = (dim_t)std::distance(ptr, last);
- T *ptr = out.get();
- T *last = std::set_union(uFirst.get() , uFirst.get() + first_elements,
- uSecond.get(), uSecond.get() + second_elements,
- ptr);
+ dim4 dims(dist, 1, 1, 1);
+ out.resetDims(dims);
+ return out;
+}
- dim_t dist = (dim_t)std::distance(ptr, last);
- dim4 dims(dist, 1, 1, 1);
- out.resetDims(dims);
+template<typename T>
+Array<T> setUnion(const Array<T> &first,
+ const Array<T> &second,
+ const bool is_unique)
+{
+ first.eval();
+ second.eval();
+ getQueue().sync();
- return out;
+ Array<T> uFirst = first;
+ Array<T> uSecond = second;
+
+ if (!is_unique) {
+ // FIXME: Perhaps copy + unique would do ?
+ uFirst = setUnique(first, false);
+ uSecond = setUnique(second, false);
}
- template<typename T>
- Array<T> setIntersect(const Array<T> &first,
- const Array<T> &second,
- const bool is_unique)
- {
- first.eval();
- second.eval();
- getQueue().sync();
+ dim_t first_elements = uFirst.elements();
+ dim_t second_elements = uSecond.elements();
+ dim_t elements = first_elements + second_elements;
- Array<T> uFirst = first;
- Array<T> uSecond = second;
+ Array<T> out = createEmptyArray<T>(af::dim4(elements));
- if (!is_unique) {
- uFirst = setUnique(first, false);
- uSecond = setUnique(second, false);
- }
+ T *ptr = out.get();
+ T *last = std::set_union(uFirst.get() , uFirst.get() + first_elements,
+ uSecond.get(), uSecond.get() + second_elements,
+ ptr);
- dim_t first_elements = uFirst.elements();
- dim_t second_elements = uSecond.elements();
- dim_t elements = std::max(first_elements, second_elements);
+ dim_t dist = (dim_t)std::distance(ptr, last);
+ dim4 dims(dist, 1, 1, 1);
+ out.resetDims(dims);
- Array<T> out = createEmptyArray<T>(af::dim4(elements));
+ return out;
+}
- T *ptr = out.get();
- T *last = std::set_intersection(uFirst.get() , uFirst.get() + first_elements,
- uSecond.get(), uSecond.get() + second_elements,
- ptr);
+template<typename T>
+Array<T> setIntersect(const Array<T> &first,
+ const Array<T> &second,
+ const bool is_unique)
+{
+ first.eval();
+ second.eval();
+ getQueue().sync();
- dim_t dist = (dim_t)std::distance(ptr, last);
- dim4 dims(dist, 1, 1, 1);
- out.resetDims(dims);
+ Array<T> uFirst = first;
+ Array<T> uSecond = second;
- return out;
+ if (!is_unique) {
+ uFirst = setUnique(first, false);
+ uSecond = setUnique(second, false);
}
+ dim_t first_elements = uFirst.elements();
+ dim_t second_elements = uSecond.elements();
+ dim_t elements = std::max(first_elements, second_elements);
+
+ Array<T> out = createEmptyArray<T>(af::dim4(elements));
+
+ T *ptr = out.get();
+ T *last = std::set_intersection(uFirst.get() , uFirst.get() + first_elements,
+ uSecond.get(), uSecond.get() + second_elements,
+ ptr);
+
+ dim_t dist = (dim_t)std::distance(ptr, last);
+ dim4 dims(dist, 1, 1, 1);
+ out.resetDims(dims);
+
+ return out;
+}
+
#define INSTANTIATE(T) \
template Array<T> setUnique<T>(const Array<T> &in, const bool is_sorted); \
template Array<T> setUnion<T>(const Array<T> &first, const Array<T> &second, const bool is_unique); \
template Array<T> setIntersect<T>(const Array<T> &first, const Array<T> &second, const bool is_unique); \
- INSTANTIATE(float)
- INSTANTIATE(double)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(char)
- INSTANTIATE(uchar)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
}
diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp
index 6a2b939..766427b 100644
--- a/src/backend/cpu/shift.cpp
+++ b/src/backend/cpu/shift.cpp
@@ -17,6 +17,7 @@
namespace cpu
{
+
static inline dim_t simple_mod(const dim_t i, const dim_t dim)
{
return (i < dim) ? i : (i - dim);
@@ -25,9 +26,9 @@ static inline dim_t simple_mod(const dim_t i, const dim_t dim)
template<typename T>
Array<T> shift(const Array<T> &in, const int sdims[4])
{
- Array<T> out = createEmptyArray<T>(in.dims());
- out.eval();
in.eval();
+
+ Array<T> out = createEmptyArray<T>(in.dims());
const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]);
auto func = [=] (Array<T> out, const Array<T> in, const af::dim4 sdims) {
@@ -91,4 +92,5 @@ INSTANTIATE(uchar)
INSTANTIATE(char)
INSTANTIATE(short)
INSTANTIATE(ushort)
+
}
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp
index 853f407..c1c92a9 100644
--- a/src/backend/cpu/sift_nonfree.hpp
+++ b/src/backend/cpu/sift_nonfree.hpp
@@ -76,161 +76,161 @@ using af::dim4;
namespace cpu
{
- static const float PI_VAL = 3.14159265358979323846f;
+static const float PI_VAL = 3.14159265358979323846f;
// default width of descriptor histogram array
- static const int DescrWidth = 4;
+static const int DescrWidth = 4;
// default number of bins per histogram in descriptor array
- static const int DescrHistBins = 8;
+static const int DescrHistBins = 8;
// assumed gaussian blur for input image
- static const float InitSigma = 0.5f;
+static const float InitSigma = 0.5f;
// width of border in which to ignore keypoints
- static const int ImgBorder = 5;
+static const int ImgBorder = 5;
// maximum steps of keypoint interpolation before failure
- static const int MaxInterpSteps = 5;
+static const int MaxInterpSteps = 5;
// default number of bins in histogram for orientation assignment
- static const int OriHistBins = 36;
+static const int OriHistBins = 36;
// determines gaussian sigma for orientation assignment
- static const float OriSigFctr = 1.5f;
+static const float OriSigFctr = 1.5f;
// determines the radius of the region used in orientation assignment */
- static const float OriRadius = 3.0f * OriSigFctr;
+static const float OriRadius = 3.0f * OriSigFctr;
// number of passes of orientation histogram smoothing
- static const int SmoothOriPasses = 2;
+static const int SmoothOriPasses = 2;
// orientation magnitude relative to max that results in new feature
- static const float OriPeakRatio = 0.8f;
+static const float OriPeakRatio = 0.8f;
// determines the size of a single descriptor orientation histogram
- static const float DescrSclFctr = 3.f;
+static const float DescrSclFctr = 3.f;
// threshold on magnitude of elements of descriptor vector
- static const float DescrMagThr = 0.2f;
+static const float DescrMagThr = 0.2f;
// factor used to convert floating-point descriptor to unsigned char
- static const float IntDescrFctr = 512.f;
+static const float IntDescrFctr = 512.f;
// Number of GLOH bins in radial direction
- static const unsigned GLOHRadialBins = 3;
+static const unsigned GLOHRadialBins = 3;
// Radiuses of GLOH descriptors
- static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
+static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f};
// Number of GLOH angular bins (excluding the inner-most radial section)
- static const unsigned GLOHAngularBins = 8;
+static const unsigned GLOHAngularBins = 8;
// Number of GLOH bins per histogram in descriptor
- static const unsigned GLOHHistBins = 16;
+static const unsigned GLOHHistBins = 16;
- typedef struct
- {
- float f[4];
- unsigned l;
- } feat_t;
+typedef struct
+{
+ float f[4];
+ unsigned l;
+} feat_t;
- bool feat_cmp(feat_t i, feat_t j)
- {
- for (int k = 0; k < 4; k++)
- if (i.f[k] != j.f[k])
- return (i.f[k] < j.f[k]);
- if (i.l != j.l)
- return (i.l < j.l);
+bool feat_cmp(feat_t i, feat_t j)
+{
+ for (int k = 0; k < 4; k++)
+ if (i.f[k] != j.f[k])
+ return (i.f[k] < j.f[k]);
+ if (i.l != j.l)
+ return (i.l < j.l);
- return true;
- }
+ return true;
+}
- void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
- {
- feat.resize(nfeat);
- for (unsigned i = 0; i < feat.size(); i++) {
- feat[i].f[0] = x[i];
- feat[i].f[1] = y[i];
- feat[i].f[2] = resp[i];
- feat[i].f[3] = size[i];
- feat[i].l = layer[i];
- }
+void array_to_feat(std::vector<feat_t>& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat)
+{
+ feat.resize(nfeat);
+ for (unsigned i = 0; i < feat.size(); i++) {
+ feat[i].f[0] = x[i];
+ feat[i].f[1] = y[i];
+ feat[i].f[2] = resp[i];
+ feat[i].f[3] = size[i];
+ feat[i].l = layer[i];
}
+}
- template<typename T>
- void gaussian1D(T* out, const int dim, double sigma=0.0)
- {
- if(!(sigma>0)) sigma = 0.25*dim;
-
- T sum = (T)0;
- for(int i=0;i<dim;i++)
- {
- int x = i-(dim-1)/2;
- T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
- out[i] = el;
- sum += el;
- }
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+ if(!(sigma>0)) sigma = 0.25*dim;
- for(int k=0;k<dim;k++)
- out[k] /= sum;
+ T sum = (T)0;
+ for(int i=0;i<dim;i++)
+ {
+ int x = i-(dim-1)/2;
+ T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
+ out[i] = el;
+ sum += el;
}
- template<typename T>
- Array<T> gauss_filter(float sigma)
- {
- // Using 6-sigma rule
- unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
+ for(int k=0;k<dim;k++)
+ out[k] /= sum;
+}
- Array<T> filter = createEmptyArray<T>(gauss_len);
- gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
+template<typename T>
+Array<T> gauss_filter(float sigma)
+{
+ // Using 6-sigma rule
+ unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u);
- return filter;
- }
+ Array<T> filter = createEmptyArray<T>(gauss_len);
+ gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma);
- template<int N>
- void gaussianElimination(float* A, float* b, float* x)
- {
- // forward elimination
- for (int i = 0; i < N-1; i++) {
- for (int j = i+1; j < N; j++) {
- float s = A[j*N+i] / A[i*N+i];
+ return filter;
+}
- for (int k = i; k < N; k++)
- A[j*N+k] -= s * A[i*N+k];
+template<int N>
+void gaussianElimination(float* A, float* b, float* x)
+{
+ // forward elimination
+ for (int i = 0; i < N-1; i++) {
+ for (int j = i+1; j < N; j++) {
+ float s = A[j*N+i] / A[i*N+i];
- b[j] -= s * b[i];
- }
+ for (int k = i; k < N; k++)
+ A[j*N+k] -= s * A[i*N+k];
+
+ b[j] -= s * b[i];
}
+ }
- for (int i = 0; i < N; i++)
- x[i] = 0;
+ for (int i = 0; i < N; i++)
+ x[i] = 0;
- // backward substitution
- float sum = 0;
- for (int i = 0; i <= N-2; i++) {
- sum = b[i];
- for (int j = i+1; j < N; j++)
- sum -= A[i*N+j] * x[j];
- x[i] = sum / A[i*N+i];
- }
+ // backward substitution
+ float sum = 0;
+ for (int i = 0; i <= N-2; i++) {
+ sum = b[i];
+ for (int j = i+1; j < N; j++)
+ sum -= A[i*N+j] * x[j];
+ x[i] = sum / A[i*N+i];
}
+}
- template<typename T>
- void sub(
- Array<T>& out,
- const Array<T>& in1,
- const Array<T>& in2)
- {
- size_t nel = in1.elements();
- T* out_ptr = out.get();
- const T* in1_ptr = in1.get();
- const T* in2_ptr = in2.get();
+template<typename T>
+void sub(
+ Array<T>& out,
+ const Array<T>& in1,
+ const Array<T>& in2)
+{
+ size_t nel = in1.elements();
+ T* out_ptr = out.get();
+ const T* in1_ptr = in1.get();
+ const T* in2_ptr = in2.get();
- for (size_t i = 0; i < nel; i++) {
- out_ptr[i] = in1_ptr[i] - in2_ptr[i];
- }
+ for (size_t i = 0; i < nel; i++) {
+ out_ptr[i] = in1_ptr[i] - in2_ptr[i];
}
+}
#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)])
#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)])
@@ -238,957 +238,958 @@ namespace cpu
// Determines whether a pixel is a scale-space extremum by comparing it to its
// 3x3x3 pixel neighborhood.
- template<typename T>
- void detectExtrema(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- unsigned* counter,
- const Array<T>& prev,
- const Array<T>& center,
- const Array<T>& next,
- const unsigned layer,
- const unsigned max_feat,
- const float threshold)
- {
- const af::dim4 idims = center.dims();
- const T* prev_ptr = prev.get();
- const T* center_ptr = center.get();
- const T* next_ptr = next.get();
-
- for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
- for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
- float p = center_ptr[y*idims[0] + x];
-
- // Find extrema
- if (abs((float)p) > threshold &&
- ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
- p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) &&
- p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) &&
- p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) &&
- p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) &&
- p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) &&
- p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1) &&
- p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) &&
- p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
- (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
- p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) &&
- p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) &&
- p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) &&
- p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) &&
- p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) &&
- p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) &&
- p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) &&
- p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
-
- if (*counter < max_feat)
- {
- x_out[*counter] = (float)y;
- y_out[*counter] = (float)x;
- layer_out[*counter] = layer;
- (*counter)++;
- }
+template<typename T>
+void detectExtrema(
+ float* x_out,
+ float* y_out,
+ unsigned* layer_out,
+ unsigned* counter,
+ const Array<T>& prev,
+ const Array<T>& center,
+ const Array<T>& next,
+ const unsigned layer,
+ const unsigned max_feat,
+ const float threshold)
+{
+ const af::dim4 idims = center.dims();
+ const T* prev_ptr = prev.get();
+ const T* center_ptr = center.get();
+ const T* next_ptr = next.get();
+
+ for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) {
+ for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) {
+ float p = center_ptr[y*idims[0] + x];
+
+ // Find extrema
+ if (abs((float)p) > threshold &&
+ ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) &&
+ p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) &&
+ p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) &&
+ p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) &&
+ p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) &&
+ p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) &&
+ p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1) &&
+ p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) &&
+ p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) ||
+ (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) &&
+ p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) &&
+ p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) &&
+ p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) &&
+ p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) &&
+ p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) &&
+ p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) &&
+ p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) &&
+ p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) {
+
+ if (*counter < max_feat)
+ {
+ x_out[*counter] = (float)y;
+ y_out[*counter] = (float)x;
+ layer_out[*counter] = layer;
+ (*counter)++;
}
}
}
}
+}
// Interpolates a scale-space extremum's location and scale to subpixel
// accuracy to form an image feature. Rejects features with low contrast.
// Based on Section 4 of Lowe's paper.
- template<typename T>
- void interpolateExtrema(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const unsigned extrema_feat,
- std::vector< Array<T> >& dog_pyr,
- const unsigned max_feat,
- const unsigned octave,
- const unsigned n_layers,
- const float contrast_thr,
- const float edge_thr,
- const float sigma,
- const float img_scale)
- {
- for (int f = 0; f < (int)extrema_feat; f++) {
- const float first_deriv_scale = img_scale*0.5f;
- const float second_deriv_scale = img_scale;
- const float cross_deriv_scale = img_scale*0.25f;
-
- float xl = 0, xy = 0, xx = 0, contr = 0;
- int i = 0;
-
- unsigned x = x_in[f];
- unsigned y = y_in[f];
- unsigned layer = layer_in[f];
-
- const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get();
- const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
- const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get();
-
- af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
-
- bool converges = true;
-
- for (i = 0; i < MaxInterpSteps; i++) {
- float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
- (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
- (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
-
- float d2 = CPTR(x, y) * 2.f;
- float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
- float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
- float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale;
- float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
- CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
- float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
- PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
- float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
- PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
-
- float H[9] = {dxx, dxy, dxs,
- dxy, dyy, dys,
- dxs, dys, dss};
-
- float X[3];
- gaussianElimination<3>(H, dD, X);
-
- xl = -X[2];
- xy = -X[1];
- xx = -X[0];
-
- if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
- break;
-
- x += round(xx);
- y += round(xy);
- layer += round(xl);
-
- if (layer < 1 || layer > n_layers ||
- x < ImgBorder || x >= idims[1] - ImgBorder ||
- y < ImgBorder || y >= idims[0] - ImgBorder) {
- converges = false;
- break;
- }
- }
+template<typename T>
+void interpolateExtrema(
+ float* x_out,
+ float* y_out,
+ unsigned* layer_out,
+ float* response_out,
+ float* size_out,
+ unsigned* counter,
+ const float* x_in,
+ const float* y_in,
+ const unsigned* layer_in,
+ const unsigned extrema_feat,
+ std::vector< Array<T> >& dog_pyr,
+ const unsigned max_feat,
+ const unsigned octave,
+ const unsigned n_layers,
+ const float contrast_thr,
+ const float edge_thr,
+ const float sigma,
+ const float img_scale)
+{
+ for (int f = 0; f < (int)extrema_feat; f++) {
+ const float first_deriv_scale = img_scale*0.5f;
+ const float second_deriv_scale = img_scale;
+ const float cross_deriv_scale = img_scale*0.25f;
- // ensure convergence of interpolation
- if (i >= MaxInterpSteps || !converges)
- continue;
+ float xl = 0, xy = 0, xx = 0, contr = 0;
+ int i = 0;
+
+ unsigned x = x_in[f];
+ unsigned y = y_in[f];
+ unsigned layer = layer_in[f];
+ const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get();
+ const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get();
+ const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get();
+
+ af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims();
+
+ bool converges = true;
+
+ for (i = 0; i < MaxInterpSteps; i++) {
float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
(float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
(float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
- float X[3] = {xx, xy, xl};
- float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
-
- contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
- if(abs(contr) < (contrast_thr / n_layers))
- continue;
-
- // principal curvatures are computed using the trace and det of Hessian
float d2 = CPTR(x, y) * 2.f;
float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
+ float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale;
float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
+ float dxs = (NPTR(x+1, y) - NPTR(x-1, y) -
+ PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale;
+ float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) -
+ PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale;
+
+ float H[9] = {dxx, dxy, dxs,
+ dxy, dyy, dys,
+ dxs, dys, dss};
+
+ float X[3];
+ gaussianElimination<3>(H, dD, X);
+
+ xl = -X[2];
+ xy = -X[1];
+ xx = -X[0];
+
+ if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f)
+ break;
+
+ x += round(xx);
+ y += round(xy);
+ layer += round(xl);
+
+ if (layer < 1 || layer > n_layers ||
+ x < ImgBorder || x >= idims[1] - ImgBorder ||
+ y < ImgBorder || y >= idims[0] - ImgBorder) {
+ converges = false;
+ break;
+ }
+ }
- float tr = dxx + dyy;
- float det = dxx * dyy - dxy * dxy;
+ // ensure convergence of interpolation
+ if (i >= MaxInterpSteps || !converges)
+ continue;
- // add FLT_EPSILON for double-precision compatibility
- if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
- continue;
+ float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale,
+ (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale,
+ (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale};
+ float X[3] = {xx, xy, xl};
- if (*counter < max_feat)
- {
- x_out[*counter] = (x + xx) * (1 << octave);
- y_out[*counter] = (y + xy) * (1 << octave);
- layer_out[*counter] = layer;
- response_out[*counter] = abs(contr);
- size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f;
- (*counter)++;
- }
+ float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2];
+
+ contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f;
+ if(abs(contr) < (contrast_thr / n_layers))
+ continue;
+
+ // principal curvatures are computed using the trace and det of Hessian
+ float d2 = CPTR(x, y) * 2.f;
+ float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale;
+ float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale;
+ float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) -
+ CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale;
+
+ float tr = dxx + dyy;
+ float det = dxx * dyy - dxy * dxy;
+
+ // add FLT_EPSILON for double-precision compatibility
+ if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON)
+ continue;
+
+ if (*counter < max_feat)
+ {
+ x_out[*counter] = (x + xx) * (1 << octave);
+ y_out[*counter] = (y + xy) * (1 << octave);
+ layer_out[*counter] = layer;
+ response_out[*counter] = abs(contr);
+ size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f;
+ (*counter)++;
}
}
+}
#undef CPTR
#undef PPTR
#undef NPTR
// Remove duplicate keypoints
- void removeDuplicates(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- unsigned* counter,
- const std::vector<feat_t>& sorted_feat)
- {
- size_t nfeat = sorted_feat.size();
-
- for (size_t f = 0; f < nfeat; f++) {
- float prec_fctr = 1e4f;
-
- if (f < nfeat-1) {
- if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
- round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
- round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
- round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
- sorted_feat[f].l == sorted_feat[f+1].l)
- continue;
- }
+void removeDuplicates(
+ float* x_out,
+ float* y_out,
+ unsigned* layer_out,
+ float* response_out,
+ float* size_out,
+ unsigned* counter,
+ const std::vector<feat_t>& sorted_feat)
+{
+ size_t nfeat = sorted_feat.size();
- x_out[*counter] = sorted_feat[f].f[0];
- y_out[*counter] = sorted_feat[f].f[1];
- response_out[*counter] = sorted_feat[f].f[2];
- size_out[*counter] = sorted_feat[f].f[3];
- layer_out[*counter] = sorted_feat[f].l;
- (*counter)++;
+ for (size_t f = 0; f < nfeat; f++) {
+ float prec_fctr = 1e4f;
+
+ if (f < nfeat-1) {
+ if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) &&
+ round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) &&
+ round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) &&
+ round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) &&
+ sorted_feat[f].l == sorted_feat[f+1].l)
+ continue;
}
+
+ x_out[*counter] = sorted_feat[f].f[0];
+ y_out[*counter] = sorted_feat[f].f[1];
+ response_out[*counter] = sorted_feat[f].f[2];
+ size_out[*counter] = sorted_feat[f].f[3];
+ layer_out[*counter] = sorted_feat[f].l;
+ (*counter)++;
}
+}
#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)])
// Computes a canonical orientation for each image feature in an array. Based
// on Section 5 of Lowe's paper. This function adds features to the array when
// there is more than one dominant orientation at a given feature location.
- template<typename T>
- void calcOrientation(
- float* x_out,
- float* y_out,
- unsigned* layer_out,
- float* response_out,
- float* size_out,
- float* ori_out,
- unsigned* counter,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const unsigned total_feat,
- const std::vector< Array<T> >& gauss_pyr,
- const unsigned max_feat,
- const unsigned octave,
- const unsigned n_layers,
- const bool double_input)
- {
- const int n = OriHistBins;
+template<typename T>
+void calcOrientation(
+ float* x_out,
+ float* y_out,
+ unsigned* layer_out,
+ float* response_out,
+ float* size_out,
+ float* ori_out,
+ unsigned* counter,
+ const float* x_in,
+ const float* y_in,
+ const unsigned* layer_in,
+ const float* response_in,
+ const float* size_in,
+ const unsigned total_feat,
+ const std::vector< Array<T> >& gauss_pyr,
+ const unsigned max_feat,
+ const unsigned octave,
+ const unsigned n_layers,
+ const bool double_input)
+{
+ const int n = OriHistBins;
- float hist[OriHistBins];
- float temphist[OriHistBins];
+ float hist[OriHistBins];
+ float temphist[OriHistBins];
- for (unsigned f = 0; f < total_feat; f++) {
- // Load keypoint information
- const float real_x = x_in[f];
- const float real_y = y_in[f];
- const unsigned layer = layer_in[f];
- const float response = response_in[f];
- const float size = size_in[f];
+ for (unsigned f = 0; f < total_feat; f++) {
+ // Load keypoint information
+ const float real_x = x_in[f];
+ const float real_y = y_in[f];
+ const unsigned layer = layer_in[f];
+ const float response = response_in[f];
+ const float size = size_in[f];
- const int pt_x = (int)round(real_x / (1 << octave));
- const int pt_y = (int)round(real_y / (1 << octave));
+ const int pt_x = (int)round(real_x / (1 << octave));
+ const int pt_y = (int)round(real_y / (1 << octave));
- // Calculate auxiliary parameters
- const float scl_octv = size*0.5f / (1 << octave);
- const int radius = (int)round(OriRadius * scl_octv);
- const float sigma = OriSigFctr * scl_octv;
- const int len = (radius*2+1);
- const float exp_denom = 2.f * sigma * sigma;
+ // Calculate auxiliary parameters
+ const float scl_octv = size*0.5f / (1 << octave);
+ const int radius = (int)round(OriRadius * scl_octv);
+ const float sigma = OriSigFctr * scl_octv;
+ const int len = (radius*2+1);
+ const float exp_denom = 2.f * sigma * sigma;
- // Points img to correct Gaussian pyramid layer
- const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
- const T* img_ptr = img.get();
+ // Points img to correct Gaussian pyramid layer
+ const Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+ const T* img_ptr = img.get();
- for (int i = 0; i < OriHistBins; i++)
- hist[i] = 0.f;
+ for (int i = 0; i < OriHistBins; i++)
+ hist[i] = 0.f;
- af::dim4 idims = img.dims();
+ af::dim4 idims = img.dims();
- // Calculate orientation histogram
- for (int l = 0; l < len*len; l++) {
- int i = l / len - radius;
- int j = l % len - radius;
+ // Calculate orientation histogram
+ for (int l = 0; l < len*len; l++) {
+ int i = l / len - radius;
+ int j = l % len - radius;
- int y = pt_y + i;
- int x = pt_x + j;
- if (y < 1 || y >= idims[0] - 1 ||
- x < 1 || x >= idims[1] - 1)
- continue;
+ int y = pt_y + i;
+ int x = pt_x + j;
+ if (y < 1 || y >= idims[0] - 1 ||
+ x < 1 || x >= idims[1] - 1)
+ continue;
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+ float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+ float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
- float mag = sqrt(dx*dx+dy*dy);
- float ori = atan2(dy,dx);
- float w = exp(-(i*i + j*j)/exp_denom);
+ float mag = sqrt(dx*dx+dy*dy);
+ float ori = atan2(dy,dx);
+ float w = exp(-(i*i + j*j)/exp_denom);
- int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
- bin = bin < n ? bin : 0;
+ int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL));
+ bin = bin < n ? bin : 0;
- hist[bin] += w*mag;
- }
+ hist[bin] += w*mag;
+ }
- for (int i = 0; i < SmoothOriPasses; i++) {
- for (int j = 0; j < n; j++) {
- temphist[j] = hist[j];
- }
- for (int j = 0; j < n; j++) {
- float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
- float next = (j+1 == n) ? temphist[0] : temphist[j+1];
- hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
- }
+ for (int i = 0; i < SmoothOriPasses; i++) {
+ for (int j = 0; j < n; j++) {
+ temphist[j] = hist[j];
}
-
- float omax = hist[0];
- for (int i = 1; i < n; i++)
- omax = max(omax, hist[i]);
-
- float mag_thr = (float)(omax * OriPeakRatio);
- int l, r;
for (int j = 0; j < n; j++) {
- l = (j == 0) ? n - 1 : j - 1;
- r = (j + 1) % n;
- if (hist[j] > hist[l] &&
- hist[j] > hist[r] &&
- hist[j] >= mag_thr) {
- if (*counter < max_feat) {
- float bin = j + 0.5f * (hist[l] - hist[r]) /
- (hist[l] - 2.0f*hist[j] + hist[r]);
- bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
- float ori = 360.f - ((360.f/n) * bin);
-
- float new_real_x = real_x;
- float new_real_y = real_y;
- float new_size = size;
-
- if (double_input) {
- float scale = 0.5f;
- new_real_x *= scale;
- new_real_y *= scale;
- new_size *= scale;
- }
+ float prev = (j == 0) ? temphist[n-1] : temphist[j-1];
+ float next = (j+1 == n) ? temphist[0] : temphist[j+1];
+ hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next;
+ }
+ }
- x_out[*counter] = new_real_x;
- y_out[*counter] = new_real_y;
- layer_out[*counter] = layer;
- response_out[*counter] = response;
- size_out[*counter] = new_size;
- ori_out[*counter] = ori;
- (*counter)++;
+ float omax = hist[0];
+ for (int i = 1; i < n; i++)
+ omax = max(omax, hist[i]);
+
+ float mag_thr = (float)(omax * OriPeakRatio);
+ int l, r;
+ for (int j = 0; j < n; j++) {
+ l = (j == 0) ? n - 1 : j - 1;
+ r = (j + 1) % n;
+ if (hist[j] > hist[l] &&
+ hist[j] > hist[r] &&
+ hist[j] >= mag_thr) {
+ if (*counter < max_feat) {
+ float bin = j + 0.5f * (hist[l] - hist[r]) /
+ (hist[l] - 2.0f*hist[j] + hist[r]);
+ bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin;
+ float ori = 360.f - ((360.f/n) * bin);
+
+ float new_real_x = real_x;
+ float new_real_y = real_y;
+ float new_size = size;
+
+ if (double_input) {
+ float scale = 0.5f;
+ new_real_x *= scale;
+ new_real_y *= scale;
+ new_size *= scale;
}
+
+ x_out[*counter] = new_real_x;
+ y_out[*counter] = new_real_y;
+ layer_out[*counter] = layer;
+ response_out[*counter] = response;
+ size_out[*counter] = new_size;
+ ori_out[*counter] = ori;
+ (*counter)++;
}
}
}
}
+}
- void normalizeDesc(
- float* desc,
- const int histlen)
- {
- float len_sq = 0.0f;
+void normalizeDesc(
+ float* desc,
+ const int histlen)
+{
+ float len_sq = 0.0f;
- for (int i = 0; i < histlen; i++)
- len_sq += desc[i] * desc[i];
+ for (int i = 0; i < histlen; i++)
+ len_sq += desc[i] * desc[i];
- float len_inv = 1.0f / sqrt(len_sq);
+ float len_inv = 1.0f / sqrt(len_sq);
- for (int i = 0; i < histlen; i++) {
- desc[i] *= len_inv;
- }
+ for (int i = 0; i < histlen; i++) {
+ desc[i] *= len_inv;
}
+}
// Computes feature descriptors for features in an array. Based on Section 6
// of Lowe's paper.
- template<typename T>
- void computeDescriptor(
- float* desc_out,
- const unsigned desc_len,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const float* ori_in,
- const unsigned total_feat,
- const std::vector< Array<T> >& gauss_pyr,
- const int d,
- const int n,
- const float scale,
- const unsigned octave,
- const unsigned n_layers)
- {
- float desc[128];
-
- for (unsigned f = 0; f < total_feat; f++) {
- const unsigned layer = layer_in[f];
- float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
- ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
- const float size = size_in[f];
- const int fx = round(x_in[f] * scale);
- const int fy = round(y_in[f] * scale);
-
- // Points img to correct Gaussian pyramid layer
- Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
- const T* img_ptr = img.get();
- af::dim4 idims = img.dims();
-
- float cos_t = cos(ori);
- float sin_t = sin(ori);
- float bins_per_rad = n / (PI_VAL * 2.f);
- float exp_denom = d * d * 0.5f;
- float hist_width = DescrSclFctr * size * scale * 0.5f;
- int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
- int len = radius*2+1;
-
- for (int i = 0; i < (int)desc_len; i++)
- desc[i] = 0.f;
-
- // Calculate orientation histogram
- for (int l = 0; l < len*len; l++) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = fy + i;
- int x = fx + j;
-
- float x_rot = (j * cos_t - i * sin_t) / hist_width;
- float y_rot = (j * sin_t + i * cos_t) / hist_width;
- float xbin = x_rot + d/2 - 0.5f;
- float ybin = y_rot + d/2 - 0.5f;
-
- if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
- y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float grad_mag = sqrt(dx*dx + dy*dy);
- float grad_ori = atan2(dy, dx) - ori;
- while (grad_ori < 0.0f)
- grad_ori += PI_VAL*2;
- while (grad_ori >= PI_VAL*2)
- grad_ori -= PI_VAL*2;
-
- float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
- float obin = grad_ori * bins_per_rad;
- float mag = grad_mag*w;
-
- int x0 = floor(xbin);
- int y0 = floor(ybin);
- int o0 = floor(obin);
- xbin -= x0;
- ybin -= y0;
- obin -= o0;
-
- for (int yl = 0; yl <= 1; yl++) {
- int yb = y0 + yl;
- if (yb >= 0 && yb < d) {
- float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
- for (int xl = 0; xl <= 1; xl++) {
- int xb = x0 + xl;
- if (xb >= 0 && xb < d) {
- float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
- for (int ol = 0; ol <= 1; ol++) {
- int ob = (o0 + ol) % n;
- float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
- desc[(yb*d + xb)*n + ob] += v_o;
- }
- }
- }
- }
- }
- }
- }
+template<typename T>
+void computeDescriptor(
+ float* desc_out,
+ const unsigned desc_len,
+ const float* x_in,
+ const float* y_in,
+ const unsigned* layer_in,
+ const float* response_in,
+ const float* size_in,
+ const float* ori_in,
+ const unsigned total_feat,
+ const std::vector< Array<T> >& gauss_pyr,
+ const int d,
+ const int n,
+ const float scale,
+ const unsigned octave,
+ const unsigned n_layers)
+{
+ float desc[128];
+
+ for (unsigned f = 0; f < total_feat; f++) {
+ const unsigned layer = layer_in[f];
+ float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+ ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+ const float size = size_in[f];
+ const int fx = round(x_in[f] * scale);
+ const int fy = round(y_in[f] * scale);
+
+ // Points img to correct Gaussian pyramid layer
+ Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+ const T* img_ptr = img.get();
+ af::dim4 idims = img.dims();
- normalizeDesc(desc, desc_len);
+ float cos_t = cos(ori);
+ float sin_t = sin(ori);
+ float bins_per_rad = n / (PI_VAL * 2.f);
+ float exp_denom = d * d * 0.5f;
+ float hist_width = DescrSclFctr * size * scale * 0.5f;
+ int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
- for (int i = 0; i < (int)desc_len; i++)
- desc[i] = min(desc[i], DescrMagThr);
+ int len = radius*2+1;
- normalizeDesc(desc, desc_len);
+ for (int i = 0; i < (int)desc_len; i++)
+ desc[i] = 0.f;
- // Calculate final descriptor values
- for (int k = 0; k < (int)desc_len; k++) {
- desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
- }
- }
- }
+ // Calculate orientation histogram
+ for (int l = 0; l < len*len; l++) {
+ int i = l / len - radius;
+ int j = l % len - radius;
-// Computes GLOH feature descriptors for features in an array. Based on Section III-B
-// of Mikolajczyk and Schmid paper.
- template<typename T>
- void computeGLOHDescriptor(
- float* desc_out,
- const unsigned desc_len,
- const float* x_in,
- const float* y_in,
- const unsigned* layer_in,
- const float* response_in,
- const float* size_in,
- const float* ori_in,
- const unsigned total_feat,
- const std::vector< Array<T> >& gauss_pyr,
- const int d,
- const unsigned rb,
- const unsigned ab,
- const unsigned hb,
- const float scale,
- const unsigned octave,
- const unsigned n_layers)
- {
- float desc[272];
-
- for (unsigned f = 0; f < total_feat; f++) {
- const unsigned layer = layer_in[f];
- float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
- ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
- const float size = size_in[f];
- const int fx = round(x_in[f] * scale);
- const int fy = round(y_in[f] * scale);
-
- // Points img to correct Gaussian pyramid layer
- Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
- const T* img_ptr = img.get();
- af::dim4 idims = img.dims();
-
- float cos_t = cos(ori);
- float sin_t = sin(ori);
- float hist_bins_per_rad = hb / (PI_VAL * 2.f);
- float polar_bins_per_rad = ab / (PI_VAL * 2.f);
- float exp_denom = GLOHRadii[rb-1] * 0.5f;
-
- float hist_width = DescrSclFctr * size * scale * 0.5f;
-
- // Keep same descriptor radius used for SIFT
- int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
-
- // Alternative radius size calculation, changing the radius weight
- // (rw) in the range of 0.25f-0.75f gives different results,
- // increasing it tends to show a better recall rate but with a
- // smaller amount of correct matches
- //float rw = 0.5f;
- //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
-
- int len = radius*2+1;
-
- for (int i = 0; i < (int)desc_len; i++)
- desc[i] = 0.f;
-
- // Calculate orientation histogram
- for (int l = 0; l < len*len; l++) {
- int i = l / len - radius;
- int j = l % len - radius;
-
- int y = fy + i;
- int x = fx + j;
-
- float x_rot = (j * cos_t - i * sin_t);
- float y_rot = (j * sin_t + i * cos_t);
-
- float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
- float theta = atan2(y_rot, x_rot);
- while (theta < 0.0f)
- theta += PI_VAL*2;
- while (theta >= PI_VAL*2)
- theta -= PI_VAL*2;
-
- float tbin = theta * polar_bins_per_rad;
- float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
- ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
- min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
-
- if (r <= GLOHRadii[rb-1] &&
- y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
- float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
- float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
-
- float grad_mag = sqrt(dx*dx + dy*dy);
- float grad_ori = atan2(dy, dx) - ori;
- while (grad_ori < 0.0f)
- grad_ori += PI_VAL*2;
- while (grad_ori >= PI_VAL*2)
- grad_ori -= PI_VAL*2;
-
- float w = exp(-r / exp_denom);
- float obin = grad_ori * hist_bins_per_rad;
- float mag = grad_mag*w;
-
- int t0 = floor(tbin);
- int r0 = floor(rbin);
- int o0 = floor(obin);
- tbin -= t0;
- rbin -= r0;
- obin -= o0;
-
- for (int rl = 0; rl <= 1; rl++) {
- int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
- float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
- if (rb >= 0 && rb <= 2) {
- for (int tl = 0; tl <= 1; tl++) {
- int tb = (t0 + tl) % ab;
- float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+ int y = fy + i;
+ int x = fx + j;
+
+ float x_rot = (j * cos_t - i * sin_t) / hist_width;
+ float y_rot = (j * sin_t + i * cos_t) / hist_width;
+ float xbin = x_rot + d/2 - 0.5f;
+ float ybin = y_rot + d/2 - 0.5f;
+
+ if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d &&
+ y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
+ float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+ float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+ float grad_mag = sqrt(dx*dx + dy*dy);
+ float grad_ori = atan2(dy, dx) - ori;
+ while (grad_ori < 0.0f)
+ grad_ori += PI_VAL*2;
+ while (grad_ori >= PI_VAL*2)
+ grad_ori -= PI_VAL*2;
+
+ float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom);
+ float obin = grad_ori * bins_per_rad;
+ float mag = grad_mag*w;
+
+ int x0 = floor(xbin);
+ int y0 = floor(ybin);
+ int o0 = floor(obin);
+ xbin -= x0;
+ ybin -= y0;
+ obin -= o0;
+
+ for (int yl = 0; yl <= 1; yl++) {
+ int yb = y0 + yl;
+ if (yb >= 0 && yb < d) {
+ float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin);
+ for (int xl = 0; xl <= 1; xl++) {
+ int xb = x0 + xl;
+ if (xb >= 0 && xb < d) {
+ float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin);
for (int ol = 0; ol <= 1; ol++) {
- int ob = (o0 + ol) % hb;
- float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
- unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
- desc[idx] += v_o;
+ int ob = (o0 + ol) % n;
+ float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin);
+ desc[(yb*d + xb)*n + ob] += v_o;
}
}
}
}
}
}
+ }
- normalizeDesc(desc, desc_len);
+ normalizeDesc(desc, desc_len);
- for (int i = 0; i < (int)desc_len; i++)
- desc[i] = min(desc[i], DescrMagThr);
+ for (int i = 0; i < (int)desc_len; i++)
+ desc[i] = min(desc[i], DescrMagThr);
- normalizeDesc(desc, desc_len);
+ normalizeDesc(desc, desc_len);
- // Calculate final descriptor values
- for (int k = 0; k < (int)desc_len; k++) {
- desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
- }
+ // Calculate final descriptor values
+ for (int k = 0; k < (int)desc_len; k++) {
+ desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
}
}
+}
-#undef IPTR
-
- template<typename T, typename convAccT>
- Array<T> createInitialImage(
- const Array<T>& img,
- const float init_sigma,
- const bool double_input)
- {
+// Computes GLOH feature descriptors for features in an array. Based on Section III-B
+// of Mikolajczyk and Schmid paper.
+template<typename T>
+void computeGLOHDescriptor(
+ float* desc_out,
+ const unsigned desc_len,
+ const float* x_in,
+ const float* y_in,
+ const unsigned* layer_in,
+ const float* response_in,
+ const float* size_in,
+ const float* ori_in,
+ const unsigned total_feat,
+ const std::vector< Array<T> >& gauss_pyr,
+ const int d,
+ const unsigned rb,
+ const unsigned ab,
+ const unsigned hb,
+ const float scale,
+ const unsigned octave,
+ const unsigned n_layers)
+{
+ float desc[272];
+
+ for (unsigned f = 0; f < total_feat; f++) {
+ const unsigned layer = layer_in[f];
+ float ori = (360.f - ori_in[f]) * PI_VAL / 180.f;
+ ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori;
+ const float size = size_in[f];
+ const int fx = round(x_in[f] * scale);
+ const int fy = round(y_in[f] * scale);
+
+ // Points img to correct Gaussian pyramid layer
+ Array<T> img = gauss_pyr[octave*(n_layers+3) + layer];
+ const T* img_ptr = img.get();
af::dim4 idims = img.dims();
- Array<T> init_img = createEmptyArray<T>(af::dim4());
+ float cos_t = cos(ori);
+ float sin_t = sin(ori);
+ float hist_bins_per_rad = hb / (PI_VAL * 2.f);
+ float polar_bins_per_rad = ab / (PI_VAL * 2.f);
+ float exp_denom = GLOHRadii[rb-1] * 0.5f;
- float s = (double_input) ? std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f)
- : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f);
+ float hist_width = DescrSclFctr * size * scale * 0.5f;
- Array<T> filter = gauss_filter<T>(s);
+ // Keep same descriptor radius used for SIFT
+ int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f;
- if (double_input) {
- Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
- init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
- }
- else {
- init_img = convolve2<T, convAccT, false>(img, filter, filter);
- }
+ // Alternative radius size calculation, changing the radius weight
+ // (rw) in the range of 0.25f-0.75f gives different results,
+ // increasing it tends to show a better recall rate but with a
+ // smaller amount of correct matches
+ //float rw = 0.5f;
+ //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f;
- return init_img;
- }
+ int len = radius*2+1;
- template<typename T, typename convAccT>
- std::vector< Array<T> > buildGaussPyr(
- const Array<T>& init_img,
- const unsigned n_octaves,
- const unsigned n_layers,
- const float init_sigma)
- {
- // Precompute Gaussian sigmas using the following formula:
- // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
- std::vector<float> sig_layers(n_layers + 3);
- sig_layers[0] = init_sigma;
- float k = std::pow(2.0f, 1.0f / n_layers);
- for (unsigned i = 1; i < n_layers + 3; i++) {
- float sig_prev = std::pow(k, i-1) * init_sigma;
- float sig_total = sig_prev * k;
- sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
- }
+ for (int i = 0; i < (int)desc_len; i++)
+ desc[i] = 0.f;
- // Gaussian Pyramid
- std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
- for (unsigned o = 0; o < n_octaves; o++) {
- for (unsigned l = 0; l < n_layers+3; l++) {
- unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
- unsigned idx = o*(n_layers+3) + l;
+ // Calculate orientation histogram
+ for (int l = 0; l < len*len; l++) {
+ int i = l / len - radius;
+ int j = l % len - radius;
- if (o == 0 && l == 0) {
- gauss_pyr[idx] = init_img;
- }
- else if (l == 0) {
- af::dim4 sdims = gauss_pyr[src_idx].dims();
- gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
- }
- else {
- Array<T> filter = gauss_filter<T>(sig_layers[l]);
+ int y = fy + i;
+ int x = fx + j;
+
+ float x_rot = (j * cos_t - i * sin_t);
+ float y_rot = (j * sin_t + i * cos_t);
+
+ float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1];
+ float theta = atan2(y_rot, x_rot);
+ while (theta < 0.0f)
+ theta += PI_VAL*2;
+ while (theta >= PI_VAL*2)
+ theta -= PI_VAL*2;
+
+ float tbin = theta * polar_bins_per_rad;
+ float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] :
+ ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) :
+ min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON));
- gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
+ if (r <= GLOHRadii[rb-1] &&
+ y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) {
+ float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y));
+ float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1));
+
+ float grad_mag = sqrt(dx*dx + dy*dy);
+ float grad_ori = atan2(dy, dx) - ori;
+ while (grad_ori < 0.0f)
+ grad_ori += PI_VAL*2;
+ while (grad_ori >= PI_VAL*2)
+ grad_ori -= PI_VAL*2;
+
+ float w = exp(-r / exp_denom);
+ float obin = grad_ori * hist_bins_per_rad;
+ float mag = grad_mag*w;
+
+ int t0 = floor(tbin);
+ int r0 = floor(rbin);
+ int o0 = floor(obin);
+ tbin -= t0;
+ rbin -= r0;
+ obin -= o0;
+
+ for (int rl = 0; rl <= 1; rl++) {
+ int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl);
+ float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin);
+ if (rb >= 0 && rb <= 2) {
+ for (int tl = 0; tl <= 1; tl++) {
+ int tb = (t0 + tl) % ab;
+ float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin);
+ for (int ol = 0; ol <= 1; ol++) {
+ int ob = (o0 + ol) % hb;
+ float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin);
+ unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob;
+ desc[idx] += v_o;
+ }
+ }
+ }
}
}
}
- return gauss_pyr;
- }
+ normalizeDesc(desc, desc_len);
- template<typename T>
- std::vector< Array<T> > buildDoGPyr(
- std::vector< Array<T> >& gauss_pyr,
- const unsigned n_octaves,
- const unsigned n_layers)
- {
- // DoG Pyramid
- std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
- for (unsigned o = 0; o < n_octaves; o++) {
- for (unsigned l = 0; l < n_layers+2; l++) {
- unsigned idx = o*(n_layers+2) + l;
- unsigned bottom = o*(n_layers+3) + l;
- unsigned top = o*(n_layers+3) + l+1;
+ for (int i = 0; i < (int)desc_len; i++)
+ desc[i] = min(desc[i], DescrMagThr);
- dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
+ normalizeDesc(desc, desc_len);
- sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
- }
+ // Calculate final descriptor values
+ for (int k = 0; k < (int)desc_len; k++) {
+ desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr));
}
-
- return dog_pyr;
}
+}
+#undef IPTR
- template<typename T, typename convAccT>
- unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
- Array<float>& ori, Array<float>& size, Array<float>& desc,
- const Array<T>& in, const unsigned n_layers,
- const float contrast_thr, const float edge_thr,
- const float init_sigma, const bool double_input,
- const float img_scale, const float feature_ratio,
- const bool compute_GLOH)
- {
- in.eval();
- af::dim4 idims = in.dims();
+template<typename T, typename convAccT>
+Array<T> createInitialImage(
+ const Array<T>& img,
+ const float init_sigma,
+ const bool double_input)
+{
+ af::dim4 idims = img.dims();
- const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
- : min(idims[0], idims[1]);
- const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
+ Array<T> init_img = createEmptyArray<T>(af::dim4());
- Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
+ float s = (double_input) ? std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f)
+ : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f);
- std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
+ Array<T> filter = gauss_filter<T>(s);
- std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
+ if (double_input) {
+ Array<T> double_img = resize<T>(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR);
+ init_img = convolve2<T, convAccT, false>(double_img, filter, filter);
+ }
+ else {
+ init_img = convolve2<T, convAccT, false>(img, filter, filter);
+ }
- std::vector<float*> x_pyr(n_octaves, NULL);
- std::vector<float*> y_pyr(n_octaves, NULL);
- std::vector<float*> response_pyr(n_octaves, NULL);
- std::vector<float*> size_pyr(n_octaves, NULL);
- std::vector<float*> ori_pyr(n_octaves, NULL);
- std::vector<float*> desc_pyr(n_octaves, NULL);
- std::vector<unsigned> feat_pyr(n_octaves, 0);
- unsigned total_feat = 0;
+ return init_img;
+}
- const unsigned d = DescrWidth;
- const unsigned n = DescrHistBins;
- const unsigned rb = GLOHRadialBins;
- const unsigned ab = GLOHAngularBins;
- const unsigned hb = GLOHHistBins;
- const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
+template<typename T, typename convAccT>
+std::vector< Array<T> > buildGaussPyr(
+ const Array<T>& init_img,
+ const unsigned n_octaves,
+ const unsigned n_layers,
+ const float init_sigma)
+{
+ // Precompute Gaussian sigmas using the following formula:
+ // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
+ std::vector<float> sig_layers(n_layers + 3);
+ sig_layers[0] = init_sigma;
+ float k = std::pow(2.0f, 1.0f / n_layers);
+ for (unsigned i = 1; i < n_layers + 3; i++) {
+ float sig_prev = std::pow(k, i-1) * init_sigma;
+ float sig_total = sig_prev * k;
+ sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
+ }
- for (unsigned i = 0; i < n_octaves; i++) {
- af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
- if (ddims[0]-2*ImgBorder < 1 ||
- ddims[1]-2*ImgBorder < 1)
- continue;
+ // Gaussian Pyramid
+ std::vector< Array<T> > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray<T>(af::dim4()));
+ for (unsigned o = 0; o < n_octaves; o++) {
+ for (unsigned l = 0; l < n_layers+3; l++) {
+ unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1;
+ unsigned idx = o*(n_layers+3) + l;
- const unsigned imel = ddims[0] * ddims[1];
- const unsigned max_feat = ceil(imel * feature_ratio);
+ if (o == 0 && l == 0) {
+ gauss_pyr[idx] = init_img;
+ }
+ else if (l == 0) {
+ af::dim4 sdims = gauss_pyr[src_idx].dims();
+ gauss_pyr[idx] = resize<T>(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR);
+ }
+ else {
+ Array<T> filter = gauss_filter<T>(sig_layers[l]);
- float* extrema_x = memAlloc<float>(max_feat);
- float* extrema_y = memAlloc<float>(max_feat);
- unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
- unsigned extrema_feat = 0;
+ gauss_pyr[idx] = convolve2<T, convAccT, false>(gauss_pyr[src_idx], filter, filter);
+ }
+ }
+ }
- for (unsigned j = 1; j <= n_layers; j++) {
- unsigned prev = i*(n_layers+2) + j-1;
- unsigned center = i*(n_layers+2) + j;
- unsigned next = i*(n_layers+2) + j+1;
+ return gauss_pyr;
+}
- unsigned layer = j;
+template<typename T>
+std::vector< Array<T> > buildDoGPyr(
+ std::vector< Array<T> >& gauss_pyr,
+ const unsigned n_octaves,
+ const unsigned n_layers)
+{
+ // DoG Pyramid
+ std::vector< Array<T> > dog_pyr(n_octaves * (n_layers+2), createEmptyArray<T>(af::dim4()));
+ for (unsigned o = 0; o < n_octaves; o++) {
+ for (unsigned l = 0; l < n_layers+2; l++) {
+ unsigned idx = o*(n_layers+2) + l;
+ unsigned bottom = o*(n_layers+3) + l;
+ unsigned top = o*(n_layers+3) + l+1;
- float extrema_thr = 0.5f * contrast_thr / n_layers;
- detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
- dog_pyr[prev], dog_pyr[center], dog_pyr[next],
- layer, max_feat, extrema_thr);
- }
+ dog_pyr[idx] = createEmptyArray<T>(gauss_pyr[bottom].dims());
- extrema_feat = min(extrema_feat, max_feat);
+ sub<T>(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]);
+ }
+ }
- if (extrema_feat == 0) {
- memFree(extrema_x);
- memFree(extrema_y);
- memFree(extrema_layer);
+ return dog_pyr;
+}
- continue;
- }
- unsigned interp_feat = 0;
+template<typename T, typename convAccT>
+unsigned sift_impl(Array<float>& x, Array<float>& y, Array<float>& score,
+ Array<float>& ori, Array<float>& size, Array<float>& desc,
+ const Array<T>& in, const unsigned n_layers,
+ const float contrast_thr, const float edge_thr,
+ const float init_sigma, const bool double_input,
+ const float img_scale, const float feature_ratio,
+ const bool compute_GLOH)
+{
+ in.eval();
+ af::dim4 idims = in.dims();
+
+ const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2)
+ : min(idims[0], idims[1]);
+ const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2;
+
+ Array<T> init_img = createInitialImage<T, convAccT>(in, init_sigma, double_input);
+
+ std::vector< Array<T> > gauss_pyr = buildGaussPyr<T, convAccT>(init_img, n_octaves, n_layers, init_sigma);
+
+ std::vector< Array<T> > dog_pyr = buildDoGPyr<T>(gauss_pyr, n_octaves, n_layers);
+
+ std::vector<float*> x_pyr(n_octaves, NULL);
+ std::vector<float*> y_pyr(n_octaves, NULL);
+ std::vector<float*> response_pyr(n_octaves, NULL);
+ std::vector<float*> size_pyr(n_octaves, NULL);
+ std::vector<float*> ori_pyr(n_octaves, NULL);
+ std::vector<float*> desc_pyr(n_octaves, NULL);
+ std::vector<unsigned> feat_pyr(n_octaves, 0);
+ unsigned total_feat = 0;
+
+ const unsigned d = DescrWidth;
+ const unsigned n = DescrHistBins;
+ const unsigned rb = GLOHRadialBins;
+ const unsigned ab = GLOHAngularBins;
+ const unsigned hb = GLOHHistBins;
+ const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n;
+
+ for (unsigned i = 0; i < n_octaves; i++) {
+ af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims();
+ if (ddims[0]-2*ImgBorder < 1 ||
+ ddims[1]-2*ImgBorder < 1)
+ continue;
+
+ const unsigned imel = ddims[0] * ddims[1];
+ const unsigned max_feat = ceil(imel * feature_ratio);
+
+ float* extrema_x = memAlloc<float>(max_feat);
+ float* extrema_y = memAlloc<float>(max_feat);
+ unsigned* extrema_layer = memAlloc<unsigned>(max_feat);
+ unsigned extrema_feat = 0;
+
+ for (unsigned j = 1; j <= n_layers; j++) {
+ unsigned prev = i*(n_layers+2) + j-1;
+ unsigned center = i*(n_layers+2) + j;
+ unsigned next = i*(n_layers+2) + j+1;
+
+ unsigned layer = j;
+
+ float extrema_thr = 0.5f * contrast_thr / n_layers;
+ detectExtrema<T>(extrema_x, extrema_y, extrema_layer, &extrema_feat,
+ dog_pyr[prev], dog_pyr[center], dog_pyr[next],
+ layer, max_feat, extrema_thr);
+ }
+
+ extrema_feat = min(extrema_feat, max_feat);
- float* interp_x = memAlloc<float>(extrema_feat);
- float* interp_y = memAlloc<float>(extrema_feat);
- unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
- float* interp_response = memAlloc<float>(extrema_feat);
- float* interp_size = memAlloc<float>(extrema_feat);
+ if (extrema_feat == 0) {
+ memFree(extrema_x);
+ memFree(extrema_y);
+ memFree(extrema_layer);
- interpolateExtrema<T>(interp_x, interp_y, interp_layer,
- interp_response, interp_size, &interp_feat,
- extrema_x, extrema_y, extrema_layer, extrema_feat,
- dog_pyr, max_feat, i, n_layers,
- contrast_thr, edge_thr, init_sigma, img_scale);
+ continue;
+ }
- interp_feat = min(interp_feat, max_feat);
+ unsigned interp_feat = 0;
- if (interp_feat == 0) {
- memFree(interp_x);
- memFree(interp_y);
- memFree(interp_layer);
- memFree(interp_response);
- memFree(interp_size);
+ float* interp_x = memAlloc<float>(extrema_feat);
+ float* interp_y = memAlloc<float>(extrema_feat);
+ unsigned* interp_layer = memAlloc<unsigned>(extrema_feat);
+ float* interp_response = memAlloc<float>(extrema_feat);
+ float* interp_size = memAlloc<float>(extrema_feat);
- continue;
- }
+ interpolateExtrema<T>(interp_x, interp_y, interp_layer,
+ interp_response, interp_size, &interp_feat,
+ extrema_x, extrema_y, extrema_layer, extrema_feat,
+ dog_pyr, max_feat, i, n_layers,
+ contrast_thr, edge_thr, init_sigma, img_scale);
- std::vector<feat_t> sorted_feat;
- array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
- std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
+ interp_feat = min(interp_feat, max_feat);
+ if (interp_feat == 0) {
memFree(interp_x);
memFree(interp_y);
memFree(interp_layer);
memFree(interp_response);
memFree(interp_size);
- unsigned nodup_feat = 0;
-
- float* nodup_x = memAlloc<float>(interp_feat);
- float* nodup_y = memAlloc<float>(interp_feat);
- unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
- float* nodup_response = memAlloc<float>(interp_feat);
- float* nodup_size = memAlloc<float>(interp_feat);
-
- removeDuplicates(nodup_x, nodup_y, nodup_layer,
- nodup_response, nodup_size, &nodup_feat,
- sorted_feat);
-
- const unsigned max_oriented_feat = nodup_feat * 3;
-
- float* oriented_x = memAlloc<float>(max_oriented_feat);
- float* oriented_y = memAlloc<float>(max_oriented_feat);
- unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
- float* oriented_response = memAlloc<float>(max_oriented_feat);
- float* oriented_size = memAlloc<float>(max_oriented_feat);
- float* oriented_ori = memAlloc<float>(max_oriented_feat);
-
- unsigned oriented_feat = 0;
-
- calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
- oriented_response, oriented_size, oriented_ori, &oriented_feat,
- nodup_x, nodup_y, nodup_layer,
- nodup_response, nodup_size, nodup_feat,
- gauss_pyr, max_oriented_feat, i, n_layers, double_input);
-
- memFree(nodup_x);
- memFree(nodup_y);
- memFree(nodup_layer);
- memFree(nodup_response);
- memFree(nodup_size);
-
- if (oriented_feat == 0) {
- memFree(oriented_x);
- memFree(oriented_y);
- memFree(oriented_layer);
- memFree(oriented_response);
- memFree(oriented_size);
- memFree(oriented_ori);
+ continue;
+ }
- continue;
- }
+ std::vector<feat_t> sorted_feat;
+ array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat);
+ std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp);
+
+ memFree(interp_x);
+ memFree(interp_y);
+ memFree(interp_layer);
+ memFree(interp_response);
+ memFree(interp_size);
+
+ unsigned nodup_feat = 0;
+
+ float* nodup_x = memAlloc<float>(interp_feat);
+ float* nodup_y = memAlloc<float>(interp_feat);
+ unsigned* nodup_layer = memAlloc<unsigned>(interp_feat);
+ float* nodup_response = memAlloc<float>(interp_feat);
+ float* nodup_size = memAlloc<float>(interp_feat);
+
+ removeDuplicates(nodup_x, nodup_y, nodup_layer,
+ nodup_response, nodup_size, &nodup_feat,
+ sorted_feat);
+
+ const unsigned max_oriented_feat = nodup_feat * 3;
+
+ float* oriented_x = memAlloc<float>(max_oriented_feat);
+ float* oriented_y = memAlloc<float>(max_oriented_feat);
+ unsigned* oriented_layer = memAlloc<unsigned>(max_oriented_feat);
+ float* oriented_response = memAlloc<float>(max_oriented_feat);
+ float* oriented_size = memAlloc<float>(max_oriented_feat);
+ float* oriented_ori = memAlloc<float>(max_oriented_feat);
+
+ unsigned oriented_feat = 0;
+
+ calcOrientation<T>(oriented_x, oriented_y, oriented_layer,
+ oriented_response, oriented_size, oriented_ori, &oriented_feat,
+ nodup_x, nodup_y, nodup_layer,
+ nodup_response, nodup_size, nodup_feat,
+ gauss_pyr, max_oriented_feat, i, n_layers, double_input);
+
+ memFree(nodup_x);
+ memFree(nodup_y);
+ memFree(nodup_layer);
+ memFree(nodup_response);
+ memFree(nodup_size);
+
+ if (oriented_feat == 0) {
+ memFree(oriented_x);
+ memFree(oriented_y);
+ memFree(oriented_layer);
+ memFree(oriented_response);
+ memFree(oriented_size);
+ memFree(oriented_ori);
+
+ continue;
+ }
- float* desc = memAlloc<float>(oriented_feat * desc_len);
+ float* desc = memAlloc<float>(oriented_feat * desc_len);
- float scale = 1.f/(1 << i);
- if (double_input) scale *= 2.f;
+ float scale = 1.f/(1 << i);
+ if (double_input) scale *= 2.f;
- if (compute_GLOH)
- computeGLOHDescriptor<T>(desc, desc_len,
- oriented_x, oriented_y, oriented_layer,
- oriented_response, oriented_size, oriented_ori,
- oriented_feat, gauss_pyr, d, rb, ab, hb,
- scale, i, n_layers);
- else
- computeDescriptor<T>(desc, desc_len,
+ if (compute_GLOH)
+ computeGLOHDescriptor<T>(desc, desc_len,
oriented_x, oriented_y, oriented_layer,
oriented_response, oriented_size, oriented_ori,
- oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
-
- total_feat += oriented_feat;
- feat_pyr[i] = oriented_feat;
-
- if (oriented_feat > 0) {
- x_pyr[i] = oriented_x;
- y_pyr[i] = oriented_y;
- response_pyr[i] = oriented_response;
- ori_pyr[i] = oriented_ori;
- size_pyr[i] = oriented_size;
- desc_pyr[i] = desc;
- }
+ oriented_feat, gauss_pyr, d, rb, ab, hb,
+ scale, i, n_layers);
+ else
+ computeDescriptor<T>(desc, desc_len,
+ oriented_x, oriented_y, oriented_layer,
+ oriented_response, oriented_size, oriented_ori,
+ oriented_feat, gauss_pyr, d, n, scale, i, n_layers);
+
+ total_feat += oriented_feat;
+ feat_pyr[i] = oriented_feat;
+
+ if (oriented_feat > 0) {
+ x_pyr[i] = oriented_x;
+ y_pyr[i] = oriented_y;
+ response_pyr[i] = oriented_response;
+ ori_pyr[i] = oriented_ori;
+ size_pyr[i] = oriented_size;
+ desc_pyr[i] = desc;
}
+ }
- if (total_feat > 0) {
- const af::dim4 total_feat_dims(total_feat);
- const af::dim4 desc_dims(desc_len, total_feat);
-
- // Allocate output memory
- x = createEmptyArray<float>(total_feat_dims);
- y = createEmptyArray<float>(total_feat_dims);
- score = createEmptyArray<float>(total_feat_dims);
- ori = createEmptyArray<float>(total_feat_dims);
- size = createEmptyArray<float>(total_feat_dims);
- desc = createEmptyArray<float>(desc_dims);
-
- float* x_ptr = x.get();
- float* y_ptr = y.get();
- float* score_ptr = score.get();
- float* ori_ptr = ori.get();
- float* size_ptr = size.get();
- float* desc_ptr = desc.get();
-
- unsigned offset = 0;
- for (unsigned i = 0; i < n_octaves; i++) {
- if (feat_pyr[i] == 0)
- continue;
-
- memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float));
-
- memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
-
- memFree(x_pyr[i]);
- memFree(y_pyr[i]);
- memFree(response_pyr[i]);
- memFree(ori_pyr[i]);
- memFree(size_pyr[i]);
- memFree(desc_pyr[i]);
-
- offset += feat_pyr[i];
- }
- }
+ if (total_feat > 0) {
+ const af::dim4 total_feat_dims(total_feat);
+ const af::dim4 desc_dims(desc_len, total_feat);
+
+ // Allocate output memory
+ x = createEmptyArray<float>(total_feat_dims);
+ y = createEmptyArray<float>(total_feat_dims);
+ score = createEmptyArray<float>(total_feat_dims);
+ ori = createEmptyArray<float>(total_feat_dims);
+ size = createEmptyArray<float>(total_feat_dims);
+ desc = createEmptyArray<float>(desc_dims);
+
+ float* x_ptr = x.get();
+ float* y_ptr = y.get();
+ float* score_ptr = score.get();
+ float* ori_ptr = ori.get();
+ float* size_ptr = size.get();
+ float* desc_ptr = desc.get();
+
+ unsigned offset = 0;
+ for (unsigned i = 0; i < n_octaves; i++) {
+ if (feat_pyr[i] == 0)
+ continue;
+
+ memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float));
+ memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * sizeof(float));
+ memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float));
+ memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float));
+ memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float));
+
+ memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float));
- return total_feat;
+ memFree(x_pyr[i]);
+ memFree(y_pyr[i]);
+ memFree(response_pyr[i]);
+ memFree(ori_pyr[i]);
+ memFree(size_pyr[i]);
+ memFree(desc_pyr[i]);
+
+ offset += feat_pyr[i];
+ }
}
+
+ return total_feat;
+}
+
}
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp
index 9f683fc..ba47ba9 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/sobel.cpp
@@ -91,6 +91,7 @@ template<typename Ti, typename To>
std::pair< Array<To>, Array<To> >
sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
{
+ img.eval();
// ket_size is for future proofing, this argument is not used
// currently
Array<To> dx = createEmptyArray<To>(img.dims());
diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp
index b279971..0243088 100644
--- a/src/backend/cpu/solve.cpp
+++ b/src/backend/cpu/solve.cpp
@@ -75,6 +75,10 @@ template<typename T>
Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
const Array<T> &b, const af_mat_prop options)
{
+ A.eval();
+ pivot.eval();
+ b.eval();
+
int N = A.dims()[0];
int NRHS = b.dims()[1];
Array< T > B = copyArray<T>(b);
@@ -114,9 +118,10 @@ Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop o
template<typename T>
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
+ a.eval();
+ b.eval();
- if (options & AF_MAT_UPPER ||
- options & AF_MAT_LOWER) {
+ if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) {
return triangleSolve<T>(a, b, options);
}
@@ -178,6 +183,7 @@ Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
namespace cpu
{
+
#define INSTANTIATE_SOLVE(T) \
template Array<T> solve<T>(const Array<T> &a, const Array<T> &b, \
const af_mat_prop options); \
@@ -188,4 +194,5 @@ INSTANTIATE_SOLVE(float)
INSTANTIATE_SOLVE(cfloat)
INSTANTIATE_SOLVE(double)
INSTANTIATE_SOLVE(cdouble)
+
}
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index 94d70a8..cbdb50e 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -25,65 +25,69 @@ using std::function;
namespace cpu
{
- ///////////////////////////////////////////////////////////////////////////
- // Kernel Functions
- ///////////////////////////////////////////////////////////////////////////
- // Based off of http://stackoverflow.com/a/12399290
- template<typename T, bool isAscending>
- void sort0(Array<T> val)
- {
- // initialize original index locations
- T *val_ptr = val.get();
+///////////////////////////////////////////////////////////////////////////
+// Kernel Functions
+///////////////////////////////////////////////////////////////////////////
- function<bool(T, T)> op = greater<T>();
- if(isAscending) { op = less<T>(); }
+// Based off of http://stackoverflow.com/a/12399290
+template<typename T, bool isAscending>
+void sort0(Array<T> val)
+{
+ // initialize original index locations
+ T *val_ptr = val.get();
+
+ function<bool(T, T)> op = greater<T>();
+ if(isAscending) { op = less<T>(); }
- T *comp_ptr = nullptr;
- for(dim_t w = 0; w < val.dims()[3]; w++) {
- dim_t valW = w * val.strides()[3];
- for(dim_t z = 0; z < val.dims()[2]; z++) {
- dim_t valWZ = valW + z * val.strides()[2];
- for(dim_t y = 0; y < val.dims()[1]; y++) {
+ T *comp_ptr = nullptr;
+ for(dim_t w = 0; w < val.dims()[3]; w++) {
+ dim_t valW = w * val.strides()[3];
+ for(dim_t z = 0; z < val.dims()[2]; z++) {
+ dim_t valWZ = valW + z * val.strides()[2];
+ for(dim_t y = 0; y < val.dims()[1]; y++) {
- dim_t valOffset = valWZ + y * val.strides()[1];
+ dim_t valOffset = valWZ + y * val.strides()[1];
- comp_ptr = val_ptr + valOffset;
- std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
- }
+ comp_ptr = val_ptr + valOffset;
+ std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
}
}
- return;
}
+ return;
+}
- ///////////////////////////////////////////////////////////////////////////
- // Wrapper Functions
- ///////////////////////////////////////////////////////////////////////////
- template<typename T, bool isAscending>
- Array<T> sort(const Array<T> &in, const unsigned dim)
- {
- Array<T> out = copyArray<T>(in);
- switch(dim) {
- case 0: getQueue().enqueue(sort0<T, isAscending>, out); break;
- default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
- }
- return out;
+///////////////////////////////////////////////////////////////////////////
+// Wrapper Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename T, bool isAscending>
+Array<T> sort(const Array<T> &in, const unsigned dim)
+{
+ in.eval();
+
+ Array<T> out = copyArray<T>(in);
+ switch(dim) {
+ case 0: getQueue().enqueue(sort0<T, isAscending>, out); break;
+ default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
}
+ return out;
+}
#define INSTANTIATE(T) \
template Array<T> sort<T, true>(const Array<T> &in, const unsigned dim); \
template Array<T> sort<T,false>(const Array<T> &in, const unsigned dim); \
- INSTANTIATE(float)
- INSTANTIATE(double)
- //INSTANTIATE(cfloat)
- //INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(char)
- INSTANTIATE(uchar)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+//INSTANTIATE(cfloat)
+//INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
}
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index f07d585..f941534 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -23,68 +23,71 @@ using std::sort;
namespace cpu
{
- ///////////////////////////////////////////////////////////////////////////
- // Kernel Functions
- ///////////////////////////////////////////////////////////////////////////
- template<typename T, bool isAscending>
- void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
- {
- // initialize original index locations
- uint *idx_ptr = idx.get();
- T *val_ptr = val.get();
- const T *in_ptr = in.get();
- function<bool(T, T)> op = greater<T>();
- if(isAscending) { op = less<T>(); }
-
- std::vector<uint> seq_vec(idx.dims()[0]);
- std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
- const T *comp_ptr = nullptr;
- auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
- for(dim_t w = 0; w < in.dims()[3]; w++) {
- dim_t valW = w * val.strides()[3];
- dim_t idxW = w * idx.strides()[3];
- dim_t inW = w * in.strides()[3];
- for(dim_t z = 0; z < in.dims()[2]; z++) {
- dim_t valWZ = valW + z * val.strides()[2];
- dim_t idxWZ = idxW + z * idx.strides()[2];
- dim_t inWZ = inW + z * in.strides()[2];
- for(dim_t y = 0; y < in.dims()[1]; y++) {
-
- dim_t valOffset = valWZ + y * val.strides()[1];
- dim_t idxOffset = idxWZ + y * idx.strides()[1];
- dim_t inOffset = inWZ + y * in.strides()[1];
-
- uint *ptr = idx_ptr + idxOffset;
- std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
- comp_ptr = in_ptr + inOffset;
- std::stable_sort(ptr, ptr + in.dims()[0], comparator);
-
- for (dim_t i = 0; i < val.dims()[0]; ++i){
- val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
- }
+
+///////////////////////////////////////////////////////////////////////////
+// Kernel Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename T, bool isAscending>
+void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
+{
+ // initialize original index locations
+ uint *idx_ptr = idx.get();
+ T *val_ptr = val.get();
+ const T *in_ptr = in.get();
+ function<bool(T, T)> op = greater<T>();
+ if(isAscending) { op = less<T>(); }
+
+ std::vector<uint> seq_vec(idx.dims()[0]);
+ std::iota(seq_vec.begin(), seq_vec.end(), 0);
+
+ const T *comp_ptr = nullptr;
+ auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+
+ for(dim_t w = 0; w < in.dims()[3]; w++) {
+ dim_t valW = w * val.strides()[3];
+ dim_t idxW = w * idx.strides()[3];
+ dim_t inW = w * in.strides()[3];
+ for(dim_t z = 0; z < in.dims()[2]; z++) {
+ dim_t valWZ = valW + z * val.strides()[2];
+ dim_t idxWZ = idxW + z * idx.strides()[2];
+ dim_t inWZ = inW + z * in.strides()[2];
+ for(dim_t y = 0; y < in.dims()[1]; y++) {
+
+ dim_t valOffset = valWZ + y * val.strides()[1];
+ dim_t idxOffset = idxWZ + y * idx.strides()[1];
+ dim_t inOffset = inWZ + y * in.strides()[1];
+
+ uint *ptr = idx_ptr + idxOffset;
+ std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+
+ comp_ptr = in_ptr + inOffset;
+ std::stable_sort(ptr, ptr + in.dims()[0], comparator);
+
+ for (dim_t i = 0; i < val.dims()[0]; ++i){
+ val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
}
}
}
-
- return;
}
- ///////////////////////////////////////////////////////////////////////////
- // Wrapper Functions
- ///////////////////////////////////////////////////////////////////////////
- template<typename T, bool isAscending>
- void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
- {
- val = createEmptyArray<T>(in.dims());
- idx = createEmptyArray<uint>(in.dims());
- switch(dim) {
- case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
- default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
- }
+ return;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Wrapper Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename T, bool isAscending>
+void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
+{
+ in.eval();
+
+ val = createEmptyArray<T>(in.dims());
+ idx = createEmptyArray<uint>(in.dims());
+ switch(dim) {
+ case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
+ default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
}
+}
#define INSTANTIATE(T) \
template void sort_index<T, true>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
@@ -92,16 +95,17 @@ namespace cpu
template void sort_index<T,false>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
const uint dim); \
- INSTANTIATE(float)
- INSTANTIATE(double)
- //INSTANTIATE(cfloat)
- //INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(char)
- INSTANTIATE(uchar)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
+INSTANTIATE(float)
+INSTANTIATE(double)
+//INSTANTIATE(cfloat)
+//INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+
}
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index e2c908c..c278908 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -106,6 +106,8 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
const unsigned radius, const float diff_thr, const float geom_thr,
const float feature_ratio, const unsigned edge)
{
+ in.eval();
+
dim4 idims = in.dims();
const unsigned corner_lim = in.elements() * feature_ratio;
diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp
index 39cbb66..92912ca 100644
--- a/src/backend/cpu/svd.cpp
+++ b/src/backend/cpu/svd.cpp
@@ -30,101 +30,106 @@ namespace cpu
#if defined(USE_MKL) || defined(__APPLE__)
- template<typename T, typename Tr>
- using svd_func_def = int (*)(ORDER_TYPE,
- char jobz,
- int m, int n,
- T* in, int ldin,
- Tr* s,
- T* u, int ldu,
- T* vt, int ldvt);
-
- SVD_FUNC_DEF( gesdd )
- SVD_FUNC(gesdd, float , float , s)
- SVD_FUNC(gesdd, double , double, d)
- SVD_FUNC(gesdd, cfloat , float , c)
- SVD_FUNC(gesdd, cdouble, double, z)
+template<typename T, typename Tr>
+using svd_func_def = int (*)(ORDER_TYPE,
+ char jobz,
+ int m, int n,
+ T* in, int ldin,
+ Tr* s,
+ T* u, int ldu,
+ T* vt, int ldvt);
+
+SVD_FUNC_DEF( gesdd )
+SVD_FUNC(gesdd, float , float , s)
+SVD_FUNC(gesdd, double , double, d)
+SVD_FUNC(gesdd, cfloat , float , c)
+SVD_FUNC(gesdd, cdouble, double, z)
#else // Atlas causes memory freeing issues with using gesdd
- template<typename T, typename Tr>
- using svd_func_def = int (*)(ORDER_TYPE,
- char jobu, char jobvt,
- int m, int n,
- T* in, int ldin,
- Tr* s,
- T* u, int ldu,
- T* vt, int ldvt,
- Tr *superb);
-
- SVD_FUNC_DEF( gesvd )
- SVD_FUNC(gesvd, float , float , s)
- SVD_FUNC(gesvd, double , double, d)
- SVD_FUNC(gesvd, cfloat , float , c)
- SVD_FUNC(gesvd, cdouble, double, z)
+template<typename T, typename Tr>
+using svd_func_def = int (*)(ORDER_TYPE,
+ char jobu, char jobvt,
+ int m, int n,
+ T* in, int ldin,
+ Tr* s,
+ T* u, int ldu,
+ T* vt, int ldvt,
+ Tr *superb);
+
+SVD_FUNC_DEF( gesvd )
+SVD_FUNC(gesvd, float , float , s)
+SVD_FUNC(gesvd, double , double, d)
+SVD_FUNC(gesvd, cfloat , float , c)
+SVD_FUNC(gesvd, cdouble, double, z)
#endif
- template <typename T, typename Tr>
- void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
- {
- s.eval();
- u.eval();
- vt.eval();
- in.eval();
+template <typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+ s.eval();
+ u.eval();
+ vt.eval();
+ in.eval();
- auto func = [=] (Array<Tr> s, Array<T> u, Array<T> vt, Array<T> in) {
- dim4 iDims = in.dims();
- int M = iDims[0];
- int N = iDims[1];
+ auto func = [=] (Array<Tr> s, Array<T> u, Array<T> vt, Array<T> in) {
+ dim4 iDims = in.dims();
+ int M = iDims[0];
+ int N = iDims[1];
#if defined(USE_MKL) || defined(__APPLE__)
- svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1],
- s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]);
+ svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1],
+ s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]);
#else
- std::vector<Tr> superb(std::min(M, N));
- svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1],
- s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]);
+ std::vector<Tr> superb(std::min(M, N));
+ svd_func<T, Tr>()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1],
+ s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]);
#endif
- };
- getQueue().enqueue(func, s, u, vt, in);
- }
-
- template <typename T, typename Tr>
- void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
- {
- Array<T> in_copy = copyArray<T>(in);
- svdInPlace(s, u, vt, in_copy);
- }
+ };
+ getQueue().enqueue(func, s, u, vt, in);
+}
+
+template <typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+ Array<T> in_copy = copyArray<T>(in);
+ svdInPlace(s, u, vt, in_copy);
+}
+
}
#else
namespace cpu
{
- template <typename T, typename Tr>
- void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
- {
- AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
- }
-
- template <typename T, typename Tr>
- void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
- {
- AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
- }
+
+template <typename T, typename Tr>
+void svd(Array<Tr> &s, Array<T> &u, Array<T> &vt, const Array<T> &in)
+{
+ AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
+template <typename T, typename Tr>
+void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
+{
+ AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED);
+}
+
}
#endif
-namespace cpu {
+namespace cpu
+{
#define INSTANTIATE_SVD(T, Tr) \
template void svd<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, const Array<T> &in); \
template void svdInPlace<T, Tr>(Array<Tr> & s, Array<T> & u, Array<T> & vt, Array<T> &in);
- INSTANTIATE_SVD(float , float )
- INSTANTIATE_SVD(double , double)
- INSTANTIATE_SVD(cfloat , float )
- INSTANTIATE_SVD(cdouble, double)
+INSTANTIATE_SVD(float , float )
+INSTANTIATE_SVD(double , double)
+INSTANTIATE_SVD(cfloat , float )
+INSTANTIATE_SVD(cdouble, double)
+
}
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp
index f4a0514..a7287ce 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/transform.cpp
@@ -107,8 +107,10 @@ template<typename T>
Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
const af_interp_type method, const bool inverse)
{
- Array<T> out = createEmptyArray<T>(odims);
in.eval();
+ transform.eval();
+
+ Array<T> out = createEmptyArray<T>(odims);
switch(method) {
case AF_INTERP_NEAREST :
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index c3a8a37..7e7eec1 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -171,5 +171,4 @@ INSTANTIATE(uintl )
INSTANTIATE(short)
INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp
index ed7f348..13bee16 100644
--- a/src/backend/cpu/triangle.cpp
+++ b/src/backend/cpu/triangle.cpp
@@ -66,6 +66,7 @@ void triangle(Array<T> &out, const Array<T> &in)
template<typename T, bool is_upper, bool is_unit_diag>
Array<T> triangle(const Array<T> &in)
{
+ in.eval();
Array<T> out = createEmptyArray<T>(in.dims());
triangle<T, is_upper, is_unit_diag>(out, in);
return out;
@@ -81,17 +82,17 @@ Array<T> triangle(const Array<T> &in)
template Array<T> triangle<T, true , false>(const Array<T> &in); \
template Array<T> triangle<T, false, false>(const Array<T> &in); \
- INSTANTIATE(float)
- INSTANTIATE(double)
- INSTANTIATE(cfloat)
- INSTANTIATE(cdouble)
- INSTANTIATE(int)
- INSTANTIATE(uint)
- INSTANTIATE(intl)
- INSTANTIATE(uintl)
- INSTANTIATE(char)
- INSTANTIATE(uchar)
- INSTANTIATE(short)
- INSTANTIATE(ushort)
+INSTANTIATE(float)
+INSTANTIATE(double)
+INSTANTIATE(cfloat)
+INSTANTIATE(cdouble)
+INSTANTIATE(int)
+INSTANTIATE(uint)
+INSTANTIATE(intl)
+INSTANTIATE(uintl)
+INSTANTIATE(char)
+INSTANTIATE(uchar)
+INSTANTIATE(short)
+INSTANTIATE(ushort)
}
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index efb46be..41423c7 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -83,8 +83,9 @@ template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
{
- af::dim4 idims = in.dims();
+ in.eval();
+ af::dim4 idims = in.dims();
dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
dim_t ny = (idims[1] + 2 * py - wy) / sy + 1;
diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp
index e6a4817..441c7ff 100644
--- a/src/backend/cpu/where.cpp
+++ b/src/backend/cpu/where.cpp
@@ -23,61 +23,62 @@ using af::dim4;
namespace cpu
{
- template<typename T>
- Array<uint> where(const Array<T> &in)
- {
- evalArray(in);
- getQueue().sync();
- const dim_t *dims = in.dims().get();
- const dim_t *strides = in.strides().get();
- static const T zero = scalar<T>(0);
+template<typename T>
+Array<uint> where(const Array<T> &in)
+{
+ evalArray(in);
+ getQueue().sync();
+
+ const dim_t *dims = in.dims().get();
+ const dim_t *strides = in.strides().get();
+ static const T zero = scalar<T>(0);
- const T *iptr = in.get();
- uint *out_vec = memAlloc<uint>(in.elements());
+ const T *iptr = in.get();
+ uint *out_vec = memAlloc<uint>(in.elements());
- dim_t count = 0;
- dim_t idx = 0;
- for (dim_t w = 0; w < dims[3]; w++) {
- uint offw = w * strides[3];
+ dim_t count = 0;
+ dim_t idx = 0;
+ for (dim_t w = 0; w < dims[3]; w++) {
+ uint offw = w * strides[3];
- for (dim_t z = 0; z < dims[2]; z++) {
- uint offz = offw + z * strides[2];
+ for (dim_t z = 0; z < dims[2]; z++) {
+ uint offz = offw + z * strides[2];
- for (dim_t y = 0; y < dims[1]; y++) {
- uint offy = y * strides[1] + offz;
+ for (dim_t y = 0; y < dims[1]; y++) {
+ uint offy = y * strides[1] + offz;
- for (dim_t x = 0; x < dims[0]; x++) {
+ for (dim_t x = 0; x < dims[0]; x++) {
- T val = iptr[offy + x];
- if (val != zero) {
- out_vec[count] = idx;
- count++;
- }
- idx++;
+ T val = iptr[offy + x];
+ if (val != zero) {
+ out_vec[count] = idx;
+ count++;
}
+ idx++;
}
}
}
-
- Array<uint> out = createDeviceDataArray<uint>(dim4(count), out_vec);
- return out;
}
+ Array<uint> out = createDeviceDataArray<uint>(dim4(count), out_vec);
+ return out;
+}
+
#define INSTANTIATE(T) \
template Array<uint> where<T>(const Array<T> &in); \
- INSTANTIATE(float )
- INSTANTIATE(cfloat )
- INSTANTIATE(double )
- INSTANTIATE(cdouble)
- INSTANTIATE(char )
- INSTANTIATE(int )
- INSTANTIATE(uint )
- INSTANTIATE(intl )
- INSTANTIATE(uintl )
- INSTANTIATE(uchar )
- INSTANTIATE(short )
- INSTANTIATE(ushort )
+INSTANTIATE(float )
+INSTANTIATE(cfloat )
+INSTANTIATE(double )
+INSTANTIATE(cdouble)
+INSTANTIATE(char )
+INSTANTIATE(int )
+INSTANTIATE(uint )
+INSTANTIATE(intl )
+INSTANTIATE(uintl )
+INSTANTIATE(uchar )
+INSTANTIATE(short )
+INSTANTIATE(ushort )
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list