[arrayfire] 79/284: Fixes for asynchronous cpu copy && set functions
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Feb 7 18:59:21 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.
commit 7dac34a56f32bf6ee5431cffe5266cc671144f70
Author: pradeep <pradeep at arrayfire.com>
Date: Wed Dec 16 19:29:14 2015 -0500
Fixes for asynchronous cpu copy && set functions
Also added a check in Array::eval that throws an exception if
Array::eval is called from a queue thread. This change also includes
the regression fixes in other functions that this eval change required.
---
src/backend/cpu/Array.cpp | 2 +
src/backend/cpu/approx.cpp | 7 +
src/backend/cpu/assign.cpp | 4 +
src/backend/cpu/blas.cpp | 6 +
src/backend/cpu/copy.cpp | 12 +-
src/backend/cpu/diagonal.cpp | 9 +-
src/backend/cpu/index.cpp | 3 +
src/backend/cpu/ireduce.cpp | 54 +++---
src/backend/cpu/morph.cpp | 6 +
src/backend/cpu/reduce.cpp | 381 ++++++++++++++++++++--------------------
src/backend/cpu/reorder.cpp | 2 +
src/backend/cpu/set.cpp | 5 +-
src/backend/cpu/sort_by_key.cpp | 173 +++++++++---------
src/backend/cpu/svd.cpp | 5 +
src/backend/cpu/tile.cpp | 2 +
src/backend/cpu/transpose.cpp | 47 +++--
16 files changed, 392 insertions(+), 326 deletions(-)
diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp
index 8577374..456f4c8 100644
--- a/src/backend/cpu/Array.cpp
+++ b/src/backend/cpu/Array.cpp
@@ -8,6 +8,7 @@
********************************************************/
#include <af/dim4.hpp>
+#include <err_common.hpp>
#include <Array.hpp>
#include <copy.hpp>
#include <TNJ/BufferNode.hpp>
@@ -69,6 +70,7 @@ namespace cpu
void Array<T>::eval()
{
if (isReady()) return;
+ if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);
this->setId(getActiveDeviceId());
diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp
index 87ae56f..4d3c880 100644
--- a/src/backend/cpu/approx.cpp
+++ b/src/backend/cpu/approx.cpp
@@ -136,6 +136,9 @@ namespace cpu
Array<Ty> approx1(const Array<Ty> &in, const Array<Tp> &pos,
const af_interp_type method, const float offGrid)
{
+ in.eval();
+ pos.eval();
+
af::dim4 odims = in.dims();
odims[0] = pos.dims()[0];
@@ -305,6 +308,10 @@ namespace cpu
Array<Ty> approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
const af_interp_type method, const float offGrid)
{
+ in.eval();
+ pos0.eval();
+ pos1.eval();
+
af::dim4 odims = in.dims();
odims[0] = pos0.dims()[0];
odims[1] = pos0.dims()[1];
diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp
index b1578d4..c5d733b 100644
--- a/src/backend/cpu/assign.cpp
+++ b/src/backend/cpu/assign.cpp
@@ -41,6 +41,9 @@ dim_t trimIndex(int idx, const dim_t &len)
template<typename T>
void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs)
{
+ out.eval();
+ rhs.eval();
+
vector<bool> isSeq(4);
vector<af_seq> seqs(4, af_span);
// create seq vector to retrieve output dimensions, offsets & offsets
@@ -56,6 +59,7 @@ void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs)
for (dim_t x=0; x<4; ++x) {
if (!isSeq[x]) {
idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
+ idxArrs[x].eval();
}
}
diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp
index 3326241..26ec8b4 100644
--- a/src/backend/cpu/blas.cpp
+++ b/src/backend/cpu/blas.cpp
@@ -147,6 +147,9 @@ template<typename T>
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
af_mat_prop optLhs, af_mat_prop optRhs)
{
+ lhs.eval();
+ rhs.eval();
+
CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs);
CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs);
@@ -225,6 +228,9 @@ template<typename T>
Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
af_mat_prop optLhs, af_mat_prop optRhs)
{
+ lhs.eval();
+ rhs.eval();
+
Array<T> out = createEmptyArray<T>(af::dim4(1));
if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) {
getQueue().enqueue(dot_<T, false, true>, out, lhs, rhs, optLhs, optRhs);
diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp
index 80f28da..5240360 100644
--- a/src/backend/cpu/copy.cpp
+++ b/src/backend/cpu/copy.cpp
@@ -48,7 +48,7 @@ namespace cpu
template<typename T>
void copyData(T *to, const Array<T> &from)
{
- evalArray(from);
+ from.eval();
getQueue().sync();
if(from.isOwner()) {
// FIXME: Check for errors / exceptions
@@ -118,16 +118,18 @@ namespace cpu
template<typename T>
void multiply_inplace(Array<T> &in, double val)
{
+ in.eval();
getQueue().enqueue(copy<T, T>, in, in, 0, val);
}
template<typename inType, typename outType>
- Array<outType>
- padArray(Array<inType> const &in, dim4 const &dims,
- outType default_value, double factor)
+ Array<outType> padArray(Array<inType> const &in, dim4 const &dims,
+ outType default_value, double factor)
{
Array<outType> ret = createValueArray<outType>(dims, default_value);
ret.eval();
+ in.eval();
+ // FIXME:
getQueue().sync();
getQueue().enqueue(copy<inType, outType>, ret, in, outType(default_value), factor);
return ret;
@@ -136,6 +138,8 @@ namespace cpu
template<typename inType, typename outType>
void copyArray(Array<outType> &out, Array<inType> const &in)
{
+ out.eval();
+ in.eval();
getQueue().enqueue(copy<inType, outType>, out, in, scalar<outType>(0), 1.0);
}
diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp
index 182027d..856ed6e 100644
--- a/src/backend/cpu/diagonal.cpp
+++ b/src/backend/cpu/diagonal.cpp
@@ -10,6 +10,7 @@
#include <af/array.h>
#include <af/dim4.hpp>
#include <af/defines.h>
+#include <handle.hpp>
#include <Array.hpp>
#include <diagonal.hpp>
#include <math.hpp>
@@ -22,6 +23,8 @@ namespace cpu
template<typename T>
Array<T> diagCreate(const Array<T> &in, const int num)
{
+ in.eval();
+
int size = in.dims()[0] + std::abs(num);
int batch = in.dims()[1];
Array<T> out = createEmptyArray<T>(dim4(size, size, batch));
@@ -52,12 +55,14 @@ namespace cpu
template<typename T>
Array<T> diagExtract(const Array<T> &in, const int num)
{
- const dim_t *idims = in.dims().get();
+ in.eval();
+
+ const dim4 idims = in.dims();
dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
auto func = [=] (Array<T> out, const Array<T> in) {
- const dim_t *odims = out.dims().get();
+ const dim4 odims = out.dims();
const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);
diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp
index c1beeea..68c2f16 100644
--- a/src/backend/cpu/index.cpp
+++ b/src/backend/cpu/index.cpp
@@ -41,6 +41,8 @@ dim_t trimIndex(dim_t idx, const dim_t &len)
template<typename T>
Array<T> index(const Array<T>& in, const af_index_t idxrs[])
{
+ in.eval();
+
vector<bool> isSeq(4);
vector<af_seq> seqs(4, af_span);
// create seq vector to retrieve output
@@ -60,6 +62,7 @@ Array<T> index(const Array<T>& in, const af_index_t idxrs[])
for (dim_t x=0; x<isSeq.size(); ++x) {
if (!isSeq[x]) {
idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
+ idxArrs[x].eval();
// set output array ith dimension value
oDims[x] = idxArrs[x].elements();
}
diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp
index 7f4b03c..e562bae 100644
--- a/src/backend/cpu/ireduce.cpp
+++ b/src/backend/cpu/ireduce.cpp
@@ -71,19 +71,16 @@ namespace cpu
template<af_op_t op, typename T, int D>
struct ireduce_dim
{
- void operator()(T *out, const dim4 ostrides, const dim4 odims,
- uint *loc,
- const T *in , const dim4 istrides, const dim4 idims,
- const int dim)
+ void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+ const Array<T> input, const dim_t inOffset, const int dim)
{
+ const dim4 odims = output.dims();
+ const dim4 ostrides = output.strides();
+ const dim4 istrides = input.strides();
const int D1 = D - 1;
for (dim_t i = 0; i < odims[D1]; i++) {
- ireduce_dim<op, T, D1>()(out + i * ostrides[D1],
- ostrides, odims,
- loc + i * ostrides[D1],
- in + i * istrides[D1],
- istrides, idims,
- dim);
+ ireduce_dim<op, T, D1>()(output, locArray, outOffset + i * ostrides[D1],
+ input, inOffset + i * istrides[D1], dim);
}
}
};
@@ -91,33 +88,38 @@ namespace cpu
template<af_op_t op, typename T>
struct ireduce_dim<op, T, 0>
{
- void operator()(T *out, const dim4 ostrides, const dim4 odims,
- uint *loc,
- const T *in , const dim4 istrides, const dim4 idims,
- const int dim)
+ void operator()(Array<T> output, Array<uint> locArray, const dim_t outOffset,
+ const Array<T> input, const dim_t inOffset, const int dim)
{
+ const dim4 idims = input.dims();
+ const dim4 istrides = input.strides();
+
+ T const * const in = input.get();
+ T * out = output.get();
+ uint * loc = locArray.get();
dim_t stride = istrides[dim];
MinMaxOp<op, T> Op(in[0], 0);
for (dim_t i = 0; i < idims[dim]; i++) {
- Op(in[i * stride], i);
+ Op(in[inOffset + i * stride], i);
}
- *out = Op.m_val;
- *loc = Op.m_idx;
+ *(out+outOffset) = Op.m_val;
+ *(loc+outOffset) = Op.m_idx;
}
};
template<af_op_t op, typename T>
- using ireduce_dim_func = std::function<void(T *out, const dim4 ostrides, const dim4 odims,
- uint *loc,
- const T *in , const dim4 istrides, const dim4 idims,
- const int dim)>;
+ using ireduce_dim_func = std::function<void(Array<T>, Array<uint>, const dim_t,
+ const Array<T>, const dim_t, const int)>;
template<af_op_t op, typename T>
- void ireduce(Array<T> &out, Array<uint> &loc,
- const Array<T> &in, const int dim)
+ void ireduce(Array<T> &out, Array<uint> &loc, const Array<T> &in, const int dim)
{
+ out.eval();
+ loc.eval();
+ in.eval();
+
dim4 odims = in.dims();
odims[dim] = 1;
static const ireduce_dim_func<op, T> ireduce_funcs[] = { ireduce_dim<op, T, 1>()
@@ -125,15 +127,15 @@ namespace cpu
, ireduce_dim<op, T, 3>()
, ireduce_dim<op, T, 4>()};
- getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out.get(), out.strides(), out.dims(),
- loc.get(), in.get(), in.strides(), in.dims(), dim);
+ getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim);
}
template<af_op_t op, typename T>
T ireduce_all(unsigned *loc, const Array<T> &in)
{
- evalArray(in);
+ in.eval();
getQueue().sync();
+
af::dim4 dims = in.dims();
af::dim4 strides = in.strides();
const T *inPtr = in.get();
diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp
index c64d09b..945c32b 100644
--- a/src/backend/cpu/morph.cpp
+++ b/src/backend/cpu/morph.cpp
@@ -33,6 +33,9 @@ static inline unsigned getIdx(const dim4 &strides,
template<typename T, bool isDilation>
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
+ in.eval();
+ mask.eval();
+
Array<T> out = createEmptyArray<T>(in.dims());
auto func = [=] (Array<T> out, const Array<T> in, const Array<T> mask) {
@@ -96,6 +99,9 @@ Array<T> morph(const Array<T> &in, const Array<T> &mask)
template<typename T, bool isDilation>
Array<T> morph3d(const Array<T> &in, const Array<T> &mask)
{
+ in.eval();
+ mask.eval();
+
Array<T> out = createEmptyArray<T>(in.dims());
auto func = [=] (Array<T> out, const Array<T> in, const Array<T> mask) {
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index e01f0c5..cce1226 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -37,220 +37,229 @@ struct Binary<cdouble, af_add_t>
namespace cpu
{
- template<af_op_t op, typename Ti, typename To, int D>
- struct reduce_dim
- {
- void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
- const Ti *in , const dim4 &istrides, const dim4 &idims,
- const int dim, bool change_nan, double nanval)
- {
- static const int D1 = D - 1;
- static reduce_dim<op, Ti, To, D1> reduce_dim_next;
- for (dim_t i = 0; i < odims[D1]; i++) {
- reduce_dim_next(out + i * ostrides[D1],
- ostrides, odims,
- in + i * istrides[D1],
- istrides, idims,
- dim, change_nan, nanval);
- }
- }
- };
- template<af_op_t op, typename Ti, typename To>
- struct reduce_dim<op, Ti, To, 0>
+template<af_op_t op, typename Ti, typename To, int D>
+struct reduce_dim
+{
+ void operator()(Array<To> out, const dim_t outOffset,
+ const Array<Ti> in, const dim_t inOffset,
+ const int dim, bool change_nan, double nanval)
{
+ static const int D1 = D - 1;
+ static reduce_dim<op, Ti, To, D1> reduce_dim_next;
- Transform<Ti, To, op> transform;
- Binary<To, op> reduce;
- void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
- const Ti *in , const dim4 &istrides, const dim4 &idims,
- const int dim, bool change_nan, double nanval)
- {
- dim_t stride = istrides[dim];
-
- To out_val = reduce.init();
- for (dim_t i = 0; i < idims[dim]; i++) {
- To in_val = transform(in[i * stride]);
- if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
- out_val = reduce(in_val, out_val);
- }
+ const dim4 ostrides = out.strides();
+ const dim4 istrides = in.strides();
+ const dim4 odims = out.dims();
- *out = out_val;
+ for (dim_t i = 0; i < odims[D1]; i++) {
+ reduce_dim_next(out, outOffset + i * ostrides[D1],
+ in, inOffset + i * istrides[D1],
+ dim, change_nan, nanval);
}
- };
+ }
+};
- template<af_op_t op, typename Ti, typename To>
- using reduce_dim_func = std::function<void(To*,const dim4&, const dim4&,
- const Ti*, const dim4&, const dim4&,
- const int, bool, double)>;
+template<af_op_t op, typename Ti, typename To>
+struct reduce_dim<op, Ti, To, 0>
+{
- template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
+ Transform<Ti, To, op> transform;
+ Binary<To, op> reduce;
+ void operator()(Array<To> out, const dim_t outOffset,
+ const Array<Ti> in, const dim_t inOffset,
+ const int dim, bool change_nan, double nanval)
{
- dim4 odims = in.dims();
- odims[dim] = 1;
- in.eval();
+ const dim4 istrides = in.strides();
+ const dim4 idims = in.dims();
+
+ To * const outPtr = out.get() + outOffset;
+ Ti const * const inPtr = in.get() + inOffset;
+ dim_t stride = istrides[dim];
+
+ To out_val = reduce.init();
+ for (dim_t i = 0; i < idims[dim]; i++) {
+ To in_val = transform(inPtr[i * stride]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+ out_val = reduce(in_val, out_val);
+ }
- Array<To> out = createEmptyArray<To>(odims);
- static const reduce_dim_func<op, Ti, To> reduce_funcs[4] = { reduce_dim<op, Ti, To, 1>()
- , reduce_dim<op, Ti, To, 2>()
- , reduce_dim<op, Ti, To, 3>()
- , reduce_dim<op, Ti, To, 4>()};
+ *outPtr = out_val;
+ }
+};
- getQueue().enqueue(reduce_funcs[in.ndims() - 1],out.get(), out.strides(), out.dims(),
- in.get(), in.strides(), in.dims(), dim,
- change_nan, nanval);
+template<af_op_t op, typename Ti, typename To>
+using reduce_dim_func = std::function<void(Array<To>, const dim_t,
+ const Array<Ti>, const dim_t,
+ const int, bool, double)>;
- return out;
- }
+template<af_op_t op, typename Ti, typename To>
+Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
+{
+ dim4 odims = in.dims();
+ odims[dim] = 1;
+ in.eval();
- template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
- {
- evalArray(in);
- getQueue().sync();
- Transform<Ti, To, op> transform;
- Binary<To, op> reduce;
+ Array<To> out = createEmptyArray<To>(odims);
+ static const reduce_dim_func<op, Ti, To> reduce_funcs[4] = { reduce_dim<op, Ti, To, 1>()
+ , reduce_dim<op, Ti, To, 2>()
+ , reduce_dim<op, Ti, To, 3>()
+ , reduce_dim<op, Ti, To, 4>()};
+
+ getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval);
+
+ return out;
+}
+
+template<af_op_t op, typename Ti, typename To>
+To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
+{
+ in.eval();
+ getQueue().sync();
+
+ Transform<Ti, To, op> transform;
+ Binary<To, op> reduce;
- To out = reduce.init();
+ To out = reduce.init();
- // Decrement dimension of select dimension
- af::dim4 dims = in.dims();
- af::dim4 strides = in.strides();
- const Ti *inPtr = in.get();
+ // Decrement dimension of select dimension
+ af::dim4 dims = in.dims();
+ af::dim4 strides = in.strides();
+ const Ti *inPtr = in.get();
- for(dim_t l = 0; l < dims[3]; l++) {
- dim_t off3 = l * strides[3];
+ for(dim_t l = 0; l < dims[3]; l++) {
+ dim_t off3 = l * strides[3];
- for(dim_t k = 0; k < dims[2]; k++) {
- dim_t off2 = k * strides[2];
+ for(dim_t k = 0; k < dims[2]; k++) {
+ dim_t off2 = k * strides[2];
- for(dim_t j = 0; j < dims[1]; j++) {
- dim_t off1 = j * strides[1];
+ for(dim_t j = 0; j < dims[1]; j++) {
+ dim_t off1 = j * strides[1];
- for(dim_t i = 0; i < dims[0]; i++) {
- dim_t idx = i + off1 + off2 + off3;
+ for(dim_t i = 0; i < dims[0]; i++) {
+ dim_t idx = i + off1 + off2 + off3;
- To in_val = transform(inPtr[idx]);
- if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
- out = reduce(in_val, out);
- }
+ To in_val = transform(inPtr[idx]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+ out = reduce(in_val, out);
}
}
}
-
- return out;
}
+ return out;
+}
+
#define INSTANTIATE(ROp, Ti, To) \
template Array<To> reduce<ROp, Ti, To>(const Array<Ti> &in, const int dim, \
bool change_nan, double nanval); \
template To reduce_all<ROp, Ti, To>(const Array<Ti> &in, \
bool change_nan, double nanval);
- //min
- INSTANTIATE(af_min_t, float , float )
- INSTANTIATE(af_min_t, double , double )
- INSTANTIATE(af_min_t, cfloat , cfloat )
- INSTANTIATE(af_min_t, cdouble, cdouble)
- INSTANTIATE(af_min_t, int , int )
- INSTANTIATE(af_min_t, uint , uint )
- INSTANTIATE(af_min_t, intl , intl )
- INSTANTIATE(af_min_t, uintl , uintl )
- INSTANTIATE(af_min_t, char , char )
- INSTANTIATE(af_min_t, uchar , uchar )
- INSTANTIATE(af_min_t, short , short )
- INSTANTIATE(af_min_t, ushort , ushort )
-
- //max
- INSTANTIATE(af_max_t, float , float )
- INSTANTIATE(af_max_t, double , double )
- INSTANTIATE(af_max_t, cfloat , cfloat )
- INSTANTIATE(af_max_t, cdouble, cdouble)
- INSTANTIATE(af_max_t, int , int )
- INSTANTIATE(af_max_t, uint , uint )
- INSTANTIATE(af_max_t, intl , intl )
- INSTANTIATE(af_max_t, uintl , uintl )
- INSTANTIATE(af_max_t, char , char )
- INSTANTIATE(af_max_t, uchar , uchar )
- INSTANTIATE(af_max_t, short , short )
- INSTANTIATE(af_max_t, ushort , ushort )
-
- //sum
- INSTANTIATE(af_add_t, float , float )
- INSTANTIATE(af_add_t, double , double )
- INSTANTIATE(af_add_t, cfloat , cfloat )
- INSTANTIATE(af_add_t, cdouble, cdouble)
- INSTANTIATE(af_add_t, int , int )
- INSTANTIATE(af_add_t, int , float )
- INSTANTIATE(af_add_t, uint , uint )
- INSTANTIATE(af_add_t, uint , float )
- INSTANTIATE(af_add_t, intl , intl )
- INSTANTIATE(af_add_t, intl , double )
- INSTANTIATE(af_add_t, uintl , uintl )
- INSTANTIATE(af_add_t, uintl , double )
- INSTANTIATE(af_add_t, char , int )
- INSTANTIATE(af_add_t, char , float )
- INSTANTIATE(af_add_t, uchar , uint )
- INSTANTIATE(af_add_t, uchar , float )
- INSTANTIATE(af_add_t, short , int )
- INSTANTIATE(af_add_t, short , float )
- INSTANTIATE(af_add_t, ushort , uint )
- INSTANTIATE(af_add_t, ushort , float )
-
- //mul
- INSTANTIATE(af_mul_t, float , float )
- INSTANTIATE(af_mul_t, double , double )
- INSTANTIATE(af_mul_t, cfloat , cfloat )
- INSTANTIATE(af_mul_t, cdouble, cdouble)
- INSTANTIATE(af_mul_t, int , int )
- INSTANTIATE(af_mul_t, uint , uint )
- INSTANTIATE(af_mul_t, intl , intl )
- INSTANTIATE(af_mul_t, uintl , uintl )
- INSTANTIATE(af_mul_t, char , int )
- INSTANTIATE(af_mul_t, uchar , uint )
- INSTANTIATE(af_mul_t, short , int )
- INSTANTIATE(af_mul_t, ushort , uint )
-
- // count
- INSTANTIATE(af_notzero_t, float , uint)
- INSTANTIATE(af_notzero_t, double , uint)
- INSTANTIATE(af_notzero_t, cfloat , uint)
- INSTANTIATE(af_notzero_t, cdouble, uint)
- INSTANTIATE(af_notzero_t, int , uint)
- INSTANTIATE(af_notzero_t, uint , uint)
- INSTANTIATE(af_notzero_t, intl , uint)
- INSTANTIATE(af_notzero_t, uintl , uint)
- INSTANTIATE(af_notzero_t, char , uint)
- INSTANTIATE(af_notzero_t, uchar , uint)
- INSTANTIATE(af_notzero_t, short , uint)
- INSTANTIATE(af_notzero_t, ushort , uint)
-
- //anytrue
- INSTANTIATE(af_or_t, float , char)
- INSTANTIATE(af_or_t, double , char)
- INSTANTIATE(af_or_t, cfloat , char)
- INSTANTIATE(af_or_t, cdouble, char)
- INSTANTIATE(af_or_t, int , char)
- INSTANTIATE(af_or_t, uint , char)
- INSTANTIATE(af_or_t, intl , char)
- INSTANTIATE(af_or_t, uintl , char)
- INSTANTIATE(af_or_t, char , char)
- INSTANTIATE(af_or_t, uchar , char)
- INSTANTIATE(af_or_t, short , char)
- INSTANTIATE(af_or_t, ushort , char)
-
- //alltrue
- INSTANTIATE(af_and_t, float , char)
- INSTANTIATE(af_and_t, double , char)
- INSTANTIATE(af_and_t, cfloat , char)
- INSTANTIATE(af_and_t, cdouble, char)
- INSTANTIATE(af_and_t, int , char)
- INSTANTIATE(af_and_t, uint , char)
- INSTANTIATE(af_and_t, intl , char)
- INSTANTIATE(af_and_t, uintl , char)
- INSTANTIATE(af_and_t, char , char)
- INSTANTIATE(af_and_t, uchar , char)
- INSTANTIATE(af_and_t, short , char)
- INSTANTIATE(af_and_t, ushort , char)
+//min
+INSTANTIATE(af_min_t, float , float )
+INSTANTIATE(af_min_t, double , double )
+INSTANTIATE(af_min_t, cfloat , cfloat )
+INSTANTIATE(af_min_t, cdouble, cdouble)
+INSTANTIATE(af_min_t, int , int )
+INSTANTIATE(af_min_t, uint , uint )
+INSTANTIATE(af_min_t, intl , intl )
+INSTANTIATE(af_min_t, uintl , uintl )
+INSTANTIATE(af_min_t, char , char )
+INSTANTIATE(af_min_t, uchar , uchar )
+INSTANTIATE(af_min_t, short , short )
+INSTANTIATE(af_min_t, ushort , ushort )
+
+//max
+INSTANTIATE(af_max_t, float , float )
+INSTANTIATE(af_max_t, double , double )
+INSTANTIATE(af_max_t, cfloat , cfloat )
+INSTANTIATE(af_max_t, cdouble, cdouble)
+INSTANTIATE(af_max_t, int , int )
+INSTANTIATE(af_max_t, uint , uint )
+INSTANTIATE(af_max_t, intl , intl )
+INSTANTIATE(af_max_t, uintl , uintl )
+INSTANTIATE(af_max_t, char , char )
+INSTANTIATE(af_max_t, uchar , uchar )
+INSTANTIATE(af_max_t, short , short )
+INSTANTIATE(af_max_t, ushort , ushort )
+
+//sum
+INSTANTIATE(af_add_t, float , float )
+INSTANTIATE(af_add_t, double , double )
+INSTANTIATE(af_add_t, cfloat , cfloat )
+INSTANTIATE(af_add_t, cdouble, cdouble)
+INSTANTIATE(af_add_t, int , int )
+INSTANTIATE(af_add_t, int , float )
+INSTANTIATE(af_add_t, uint , uint )
+INSTANTIATE(af_add_t, uint , float )
+INSTANTIATE(af_add_t, intl , intl )
+INSTANTIATE(af_add_t, intl , double )
+INSTANTIATE(af_add_t, uintl , uintl )
+INSTANTIATE(af_add_t, uintl , double )
+INSTANTIATE(af_add_t, char , int )
+INSTANTIATE(af_add_t, char , float )
+INSTANTIATE(af_add_t, uchar , uint )
+INSTANTIATE(af_add_t, uchar , float )
+INSTANTIATE(af_add_t, short , int )
+INSTANTIATE(af_add_t, short , float )
+INSTANTIATE(af_add_t, ushort , uint )
+INSTANTIATE(af_add_t, ushort , float )
+
+//mul
+INSTANTIATE(af_mul_t, float , float )
+INSTANTIATE(af_mul_t, double , double )
+INSTANTIATE(af_mul_t, cfloat , cfloat )
+INSTANTIATE(af_mul_t, cdouble, cdouble)
+INSTANTIATE(af_mul_t, int , int )
+INSTANTIATE(af_mul_t, uint , uint )
+INSTANTIATE(af_mul_t, intl , intl )
+INSTANTIATE(af_mul_t, uintl , uintl )
+INSTANTIATE(af_mul_t, char , int )
+INSTANTIATE(af_mul_t, uchar , uint )
+INSTANTIATE(af_mul_t, short , int )
+INSTANTIATE(af_mul_t, ushort , uint )
+
+// count
+INSTANTIATE(af_notzero_t, float , uint)
+INSTANTIATE(af_notzero_t, double , uint)
+INSTANTIATE(af_notzero_t, cfloat , uint)
+INSTANTIATE(af_notzero_t, cdouble, uint)
+INSTANTIATE(af_notzero_t, int , uint)
+INSTANTIATE(af_notzero_t, uint , uint)
+INSTANTIATE(af_notzero_t, intl , uint)
+INSTANTIATE(af_notzero_t, uintl , uint)
+INSTANTIATE(af_notzero_t, char , uint)
+INSTANTIATE(af_notzero_t, uchar , uint)
+INSTANTIATE(af_notzero_t, short , uint)
+INSTANTIATE(af_notzero_t, ushort , uint)
+
+//anytrue
+INSTANTIATE(af_or_t, float , char)
+INSTANTIATE(af_or_t, double , char)
+INSTANTIATE(af_or_t, cfloat , char)
+INSTANTIATE(af_or_t, cdouble, char)
+INSTANTIATE(af_or_t, int , char)
+INSTANTIATE(af_or_t, uint , char)
+INSTANTIATE(af_or_t, intl , char)
+INSTANTIATE(af_or_t, uintl , char)
+INSTANTIATE(af_or_t, char , char)
+INSTANTIATE(af_or_t, uchar , char)
+INSTANTIATE(af_or_t, short , char)
+INSTANTIATE(af_or_t, ushort , char)
+
+//alltrue
+INSTANTIATE(af_and_t, float , char)
+INSTANTIATE(af_and_t, double , char)
+INSTANTIATE(af_and_t, cfloat , char)
+INSTANTIATE(af_and_t, cdouble, char)
+INSTANTIATE(af_and_t, int , char)
+INSTANTIATE(af_and_t, uint , char)
+INSTANTIATE(af_and_t, intl , char)
+INSTANTIATE(af_and_t, uintl , char)
+INSTANTIATE(af_and_t, char , char)
+INSTANTIATE(af_and_t, uchar , char)
+INSTANTIATE(af_and_t, short , char)
+INSTANTIATE(af_and_t, ushort , char)
+
}
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index 7d75582..afe5620 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -53,6 +53,8 @@ namespace cpu
template<typename T>
Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
{
+ in.eval();
+
const af::dim4 iDims = in.dims();
af::dim4 oDims(0);
for(int i = 0; i < 4; i++)
diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp
index d9ca084..67aa586 100644
--- a/src/backend/cpu/set.cpp
+++ b/src/backend/cpu/set.cpp
@@ -31,12 +31,15 @@ namespace cpu
const bool is_sorted)
{
in.eval();
- getQueue().sync();
Array<T> out = createEmptyArray<T>(af::dim4());
if (is_sorted) out = copyArray<T>(in);
else out = sort<T, 1>(in, 0);
+ // Need to sync old jobs since we need to
+ // operate on pointers directly in std::unique
+ getQueue().sync();
+
T *ptr = out.get();
T *last = std::unique(ptr, ptr + in.elements());
dim_t dist = (dim_t)std::distance(ptr, last);
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index 684b9ba..d2ebd42 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -27,84 +27,92 @@ using std::async;
namespace cpu
{
- ///////////////////////////////////////////////////////////////////////////
- // Kernel Functions
- ///////////////////////////////////////////////////////////////////////////
-
- template<typename Tk, typename Tv, bool isAscending>
- void sort0_by_key(Array<Tk> okey, Array<Tv> oval, const Array<Tk> ikey, const Array<Tv> ival)
- {
- function<bool(Tk, Tk)> op = greater<Tk>();
- if(isAscending) { op = less<Tk>(); }
-
- // Get pointers and initialize original index locations
- Array<uint> oidx = createValueArray(ikey.dims(), 0u);
- uint *oidx_ptr = oidx.get();
- Tk *okey_ptr = okey.get();
- Tv *oval_ptr = oval.get();
- const Tk *ikey_ptr = ikey.get();
- const Tv *ival_ptr = ival.get();
-
- std::vector<uint> seq_vec(oidx.dims()[0]);
- std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
- const Tk *comp_ptr = nullptr;
- auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
- for(dim_t w = 0; w < ikey.dims()[3]; w++) {
- dim_t okeyW = w * okey.strides()[3];
- dim_t ovalW = w * oval.strides()[3];
- dim_t oidxW = w * oidx.strides()[3];
- dim_t ikeyW = w * ikey.strides()[3];
- dim_t ivalW = w * ival.strides()[3];
-
- for(dim_t z = 0; z < ikey.dims()[2]; z++) {
- dim_t okeyWZ = okeyW + z * okey.strides()[2];
- dim_t ovalWZ = ovalW + z * oval.strides()[2];
- dim_t oidxWZ = oidxW + z * oidx.strides()[2];
- dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
- dim_t ivalWZ = ivalW + z * ival.strides()[2];
-
- for(dim_t y = 0; y < ikey.dims()[1]; y++) {
-
- dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
- dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
- dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
- dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
- dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
-
- uint *ptr = oidx_ptr + oidxOffset;
- std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
- comp_ptr = ikey_ptr + ikeyOffset;
- std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
-
- for (dim_t i = 0; i < oval.dims()[0]; ++i){
- uint sortIdx = oidx_ptr[oidxOffset + i];
- okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
- oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
- }
+
+///////////////////////////////////////////////////////////////////////////
+// Kernel Functions
+///////////////////////////////////////////////////////////////////////////
+
+template<typename Tk, typename Tv, bool isAscending>
+void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
+ const Array<Tk> ikey, const Array<Tv> ival)
+{
+ function<bool(Tk, Tk)> op = greater<Tk>();
+ if(isAscending) { op = less<Tk>(); }
+
+ // Get pointers and initialize original index locations
+ uint *oidx_ptr = oidx.get();
+ Tk *okey_ptr = okey.get();
+ Tv *oval_ptr = oval.get();
+ const Tk *ikey_ptr = ikey.get();
+ const Tv *ival_ptr = ival.get();
+
+ std::vector<uint> seq_vec(oidx.dims()[0]);
+ std::iota(seq_vec.begin(), seq_vec.end(), 0);
+
+ const Tk *comp_ptr = nullptr;
+ auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+
+ for(dim_t w = 0; w < ikey.dims()[3]; w++) {
+ dim_t okeyW = w * okey.strides()[3];
+ dim_t ovalW = w * oval.strides()[3];
+ dim_t oidxW = w * oidx.strides()[3];
+ dim_t ikeyW = w * ikey.strides()[3];
+ dim_t ivalW = w * ival.strides()[3];
+
+ for(dim_t z = 0; z < ikey.dims()[2]; z++) {
+ dim_t okeyWZ = okeyW + z * okey.strides()[2];
+ dim_t ovalWZ = ovalW + z * oval.strides()[2];
+ dim_t oidxWZ = oidxW + z * oidx.strides()[2];
+ dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
+ dim_t ivalWZ = ivalW + z * ival.strides()[2];
+
+ for(dim_t y = 0; y < ikey.dims()[1]; y++) {
+
+ dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
+ dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
+ dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
+ dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
+ dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
+
+ uint *ptr = oidx_ptr + oidxOffset;
+ std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+
+ comp_ptr = ikey_ptr + ikeyOffset;
+ std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
+
+ for (dim_t i = 0; i < oval.dims()[0]; ++i){
+ uint sortIdx = oidx_ptr[oidxOffset + i];
+ okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
+ oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
}
}
}
-
- return;
}
- ///////////////////////////////////////////////////////////////////////////
- // Wrapper Functions
- ///////////////////////////////////////////////////////////////////////////
- template<typename Tk, typename Tv, bool isAscending>
- void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
- const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
- {
- okey = createEmptyArray<Tk>(ikey.dims());
- oval = createEmptyArray<Tv>(ival.dims());
- switch(dim) {
- case 0: getQueue().enqueue(sort0_by_key<Tk, Tv, isAscending>, okey, oval, ikey, ival); break;
- default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
- }
+ return;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Wrapper Functions
+///////////////////////////////////////////////////////////////////////////
+template<typename Tk, typename Tv, bool isAscending>
+void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
+ const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
+{
+ ikey.eval();
+ ival.eval();
+
+ okey = createEmptyArray<Tk>(ikey.dims());
+ oval = createEmptyArray<Tv>(ival.dims());
+ Array<uint> oidx = createValueArray(ikey.dims(), 0u);
+ oidx.eval();
+
+ switch(dim) {
+ case 0: getQueue().enqueue(sort0_by_key<Tk, Tv, isAscending>,
+ okey, oval, oidx, ikey, ival); break;
+ default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
}
+}
#define INSTANTIATE(Tk, Tv) \
template void \
@@ -127,14 +135,15 @@ namespace cpu
INSTANTIATE(Tk, uintl) \
- INSTANTIATE1(float)
- INSTANTIATE1(double)
- INSTANTIATE1(int)
- INSTANTIATE1(uint)
- INSTANTIATE1(char)
- INSTANTIATE1(uchar)
- INSTANTIATE1(short)
- INSTANTIATE1(ushort)
- INSTANTIATE1(intl)
- INSTANTIATE1(uintl)
+INSTANTIATE1(float)
+INSTANTIATE1(double)
+INSTANTIATE1(int)
+INSTANTIATE1(uint)
+INSTANTIATE1(char)
+INSTANTIATE1(uchar)
+INSTANTIATE1(short)
+INSTANTIATE1(ushort)
+INSTANTIATE1(intl)
+INSTANTIATE1(uintl)
+
}
diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp
index 33bfab7..39cbb66 100644
--- a/src/backend/cpu/svd.cpp
+++ b/src/backend/cpu/svd.cpp
@@ -68,6 +68,11 @@ namespace cpu
template <typename T, typename Tr>
void svdInPlace(Array<Tr> &s, Array<T> &u, Array<T> &vt, Array<T> &in)
{
+ s.eval();
+ u.eval();
+ vt.eval();
+ in.eval();
+
auto func = [=] (Array<Tr> s, Array<T> u, Array<T> vt, Array<T> in) {
dim4 iDims = in.dims();
int M = iDims[0];
diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp
index f756012..4f03545 100644
--- a/src/backend/cpu/tile.cpp
+++ b/src/backend/cpu/tile.cpp
@@ -20,6 +20,8 @@ namespace cpu
template<typename T>
Array<T> tile(const Array<T> &in, const af::dim4 &tileDims)
{
+ in.eval();
+
const af::dim4 iDims = in.dims();
af::dim4 oDims = iDims;
oDims *= tileDims;
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index c89243b..c3a8a37 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -52,9 +52,15 @@ cdouble getConjugate(const cdouble &in)
}
template<typename T, bool conjugate>
-void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idims,
- const af::dim4 &ostrides, const af::dim4 &istrides)
+void transpose_(Array<T> output, const Array<T> input)
{
+ const dim4 odims = output.dims();
+ const dim4 ostrides = output.strides();
+ const dim4 istrides = input.strides();
+
+ T * out = output.get();
+ T const * const in = input.get();
+
for (dim_t l = 0; l < odims[3]; ++l) {
for (dim_t k = 0; k < odims[2]; ++k) {
// Outermost loop handles batch mode
@@ -82,35 +88,32 @@ void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idim
template<typename T>
void transpose_(Array<T> out, const Array<T> in, const bool conjugate)
{
- // get data pointers for input and output Arrays
- T* outData = out.get();
- const T* inData = in.get();
-
- if(conjugate) {
- transpose_<T, true>(outData, inData,
- out.dims(), in.dims(), out.strides(), in.strides());
- } else {
- transpose_<T, false>(outData, inData,
- out.dims(), in.dims(), out.strides(), in.strides());
- }
+ return (conjugate ? transpose_<T, true>(out, in) : transpose_<T, false>(out, in));
}
template<typename T>
Array<T> transpose(const Array<T> &in, const bool conjugate)
{
- const dim4 inDims = in.dims();
-
- dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]);
+ in.eval();
+ const dim4 inDims = in.dims();
+ const dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]);
// create an array with first two dimensions swapped
Array<T> out = createEmptyArray<T>(outDims);
+
getQueue().enqueue(transpose_<T>, out, in, conjugate);
+
return out;
}
template<typename T, bool conjugate>
-void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides)
+void transpose_inplace(Array<T> input)
{
+ const dim4 idims = input.dims();
+ const dim4 istrides = input.strides();
+
+ T * in = input.get();
+
for (dim_t l = 0; l < idims[3]; ++l) {
for (dim_t k = 0; k < idims[2]; ++k) {
// Outermost loop handles batch mode
@@ -141,19 +144,13 @@ void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides)
template<typename T>
void transpose_inplace_(Array<T> in, const bool conjugate)
{
- // get data pointers for input and output Arrays
- T* inData = in.get();
-
- if(conjugate) {
- transpose_inplace<T, true >(inData, in.dims(), in.strides());
- } else {
- transpose_inplace<T, false>(inData, in.dims(), in.strides());
- }
+ return (conjugate ? transpose_inplace<T, true >(in) : transpose_inplace<T, false>(in));
}
template<typename T>
void transpose_inplace(Array<T> &in, const bool conjugate)
{
+ in.eval();
getQueue().enqueue(transpose_inplace_<T>, in, conjugate);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list