[arrayfire] 87/408: FEAT: Added support to substitute nan values for sum and product
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:24 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 3b2ce09be4b669353df6d4737609b423d7906ccd
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Thu Jul 2 15:03:31 2015 -0400
FEAT: Added support to substitute nan values for sum and product
---
include/af/algorithm.h | 126 ++++++++++++++++++++++++++++++
src/api/c/reduce.cpp | 71 +++++++++++------
src/api/cpp/reduce.cpp | 81 ++++++++++++++-----
src/backend/cpu/reduce.cpp | 35 +++++----
src/backend/cpu/reduce.hpp | 6 +-
src/backend/cuda/kernel/reduce.hpp | 88 +++++++++++----------
src/backend/cuda/reduce.hpp | 6 +-
src/backend/cuda/reduce_impl.hpp | 13 +--
src/backend/opencl/kernel/reduce.hpp | 94 +++++++++++++---------
src/backend/opencl/kernel/reduce_dim.cl | 4 +-
src/backend/opencl/kernel/reduce_first.cl | 4 +-
src/backend/opencl/reduce.hpp | 5 +-
src/backend/opencl/reduce_impl.hpp | 13 +--
13 files changed, 388 insertions(+), 158 deletions(-)
diff --git a/include/af/algorithm.h b/include/af/algorithm.h
index a4d8833..e274d17 100644
--- a/include/af/algorithm.h
+++ b/include/af/algorithm.h
@@ -29,6 +29,19 @@ namespace af
AFAPI array sum(const array &in, const int dim = -1);
/**
+ C++ Interface for sum of elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the add operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return result of the sum of all values along dimension \p dim
+
+ \ingroup reduce_func_sum
+
+ */
+ AFAPI array sum(const array &in, const int dim, const double nanval);
+
+ /**
C++ Interface for product of elements in an array
\param[in] in is the input array
@@ -42,6 +55,20 @@ namespace af
AFAPI array product(const array &in, const int dim = -1);
/**
+ C++ Interface for product of elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the multiply operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return result of the product of all values along dimension \p dim
+
+ \ingroup reduce_func_product
+
+ */
+ AFAPI array product(const array &in, const int dim, const double nanval);
+
+
+ /**
C++ Interface for minimum values in an array
\param[in] in is the input array
@@ -51,6 +78,7 @@ namespace af
\ingroup reduce_func_min
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array min(const array &in, const int dim = -1);
@@ -64,6 +92,7 @@ namespace af
\ingroup reduce_func_max
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array max(const array &in, const int dim = -1);
@@ -77,6 +106,7 @@ namespace af
\ingroup reduce_func_all_true
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array allTrue(const array &in, const int dim = -1);
@@ -90,6 +120,7 @@ namespace af
\ingroup reduce_func_any_true
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array anyTrue(const array &in, const int dim = -1);
@@ -103,6 +134,7 @@ namespace af
\ingroup reduce_func_count
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are treated as non zero.
*/
AFAPI array count(const array &in, const int dim = -1);
@@ -117,6 +149,17 @@ namespace af
template<typename T> T sum(const array &in);
/**
+ C++ Interface for sum of all elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] nanval Replace nans with the value passed to this function
+ \return the sum of all values of \p in
+
+ \ingroup reduce_func_sum
+ */
+ template<typename T> T sum(const array &in, double nanval);
+
+ /**
C++ Interface for product of all elements in an array
\param[in] in is the input array
@@ -127,12 +170,25 @@ namespace af
template<typename T> T product(const array &in);
/**
+ C++ Interface for product of all elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] nanval Replace nans with the value passed to this function
+ \return the product of all values of \p in
+
+ \ingroup reduce_func_product
+ */
+ template<typename T> T product(const array &in, double nanval);
+
+ /**
C++ Interface for getting minimum value of an array
\param[in] in is the input array
\return the minimum of all values of \p in
\ingroup reduce_func_min
+
+ \note NaN values are ignored
*/
template<typename T> T min(const array &in);
@@ -143,6 +199,8 @@ namespace af
\return the maximum of all values of \p in
\ingroup reduce_func_max
+
+ \note NaN values are ignored
*/
template<typename T> T max(const array &in);
@@ -153,6 +211,8 @@ namespace af
\return true if all values of \p in are true, false otherwise
\ingroup reduce_func_all_true
+
+ \note NaN values are ignored
*/
template<typename T> T allTrue(const array &in);
@@ -163,6 +223,8 @@ namespace af
\return true if any values of \p in are true, false otherwise
\ingroup reduce_func_any_true
+
+ \note NaN values are ignored
*/
template<typename T> T anyTrue(const array &in);
@@ -173,6 +235,8 @@ namespace af
\return the number of non-zero values in \p in
\ingroup reduce_func_count
+
+ \note NaN values are treated as non zero
*/
template<typename T> T count(const array &in);
@@ -187,6 +251,8 @@ namespace af
\ingroup reduce_func_min
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+
+ \note NaN values are ignored
*/
AFAPI void min(array &val, array &idx, const array &in, const int dim = -1);
@@ -201,6 +267,8 @@ namespace af
\ingroup reduce_func_max
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+
+ \note NaN values are ignored
*/
AFAPI void max(array &val, array &idx, const array &in, const int dim = -1);
@@ -212,6 +280,8 @@ namespace af
\param[in] in is the input array
\ingroup reduce_func_min
+
+ \note NaN values are ignored
*/
template<typename T> void min(T *val, unsigned *idx, const array &in);
@@ -223,6 +293,8 @@ namespace af
\param[in] in is the input array
\ingroup reduce_func_max
+
+ \note NaN values are ignored
*/
template<typename T> void max(T *val, unsigned *idx, const array &in);
@@ -369,6 +441,19 @@ extern "C" {
AFAPI af_err af_sum(af_array *out, const af_array in, const int dim);
/**
+ C Interface for sum of elements in an array while replacing nans
+
+ \param[out] out will contain the sum of all values in \p in along \p dim
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the add operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \ingroup reduce_func_sum
+ */
+ AFAPI af_err af_sum_nan(af_array *out, const af_array in, const int dim, const double nanval);
+
+ /**
C Interface for product of elements in an array
\param[out] out will contain the product of all values in \p in along \p dim
@@ -381,6 +466,19 @@ extern "C" {
AFAPI af_err af_product(af_array *out, const af_array in, const int dim);
/**
+ C Interface for product of elements in an array while replacing nans
+
+ \param[out] out will contain the product of all values in \p in along \p dim
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the multiply operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \ingroup reduce_func_product
+ */
+ AFAPI af_err af_product_nan(af_array *out, const af_array in, const int dim, const double nanval);
+
+ /**
C Interface for minimum values in an array
\param[out] out will contain the minimum of all values in \p in along \p dim
@@ -455,6 +553,20 @@ extern "C" {
AFAPI af_err af_sum_all(double *real, double *imag, const af_array in);
/**
+ C Interface for sum of all elements in an array while replacing nans
+
+ \param[out] real will contain the real part of adding all elements in input \p in
+ \param[out] imag will contain the imaginary part of adding all elements in input \p in
+ \param[in] in is the input array
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \note \p imag is always set to 0 when \p in is real
+
+ \ingroup reduce_func_sum
+ */
+ AFAPI af_err af_sum_nan_all(double *real, double *imag, const af_array in, const double nanval);
+
+ /**
C Interface for product of all elements in an array
\param[out] real will contain the real part of multiplying all elements in input \p in
@@ -469,6 +581,20 @@ extern "C" {
AFAPI af_err af_product_all(double *real, double *imag, const af_array in);
/**
+ C Interface for product of all elements in an array while replacing nans
+
+ \param[out] real will contain the real part of multiplying all elements in input \p in
+ \param[out] imag will contain the imaginary part of multiplying all elements in input \p in
+ \param[in] in is the input array
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \note \p imag is always set to 0 when \p in is real
+
+ \ingroup reduce_func_product
+ */
+ AFAPI af_err af_product_nan_all(double *real, double *imag, const af_array in, const double nanval);
+
+ /**
C Interface for getting minimum value of an array
\param[out] real will contain the real part of minimum value of all elements in input \p in
diff --git a/src/api/c/reduce.cpp b/src/api/c/reduce.cpp
index f4d5e61..cedf4f9 100644
--- a/src/api/c/reduce.cpp
+++ b/src/api/c/reduce.cpp
@@ -23,9 +23,10 @@ using af::dim4;
using namespace detail;
template<af_op_t op, typename Ti, typename To>
-static inline af_array reduce(const af_array in, const int dim)
+static inline af_array reduce(const af_array in, const int dim,
+ bool change_nan = false, double nanval = 0)
{
- return getHandle(reduce<op,Ti,To>(getArray<Ti>(in), dim));
+ return getHandle(reduce<op,Ti,To>(getArray<Ti>(in), dim, change_nan, nanval));
}
template<af_op_t op, typename To>
@@ -107,7 +108,8 @@ static af_err reduce_common(af_array *out, const af_array in, const int dim)
}
template<af_op_t op>
-static af_err reduce_promote(af_array *out, const af_array in, const int dim)
+static af_err reduce_promote(af_array *out, const af_array in, const int dim,
+ bool change_nan=false, double nanval=0)
{
try {
@@ -125,17 +127,17 @@ static af_err reduce_promote(af_array *out, const af_array in, const int dim)
af_array res;
switch(type) {
- case f32: res = reduce<op, float , float >(in, dim); break;
- case f64: res = reduce<op, double , double >(in, dim); break;
- case c32: res = reduce<op, cfloat , cfloat >(in, dim); break;
- case c64: res = reduce<op, cdouble, cdouble>(in, dim); break;
- case u32: res = reduce<op, uint , uint >(in, dim); break;
- case s32: res = reduce<op, int , int >(in, dim); break;
- case u64: res = reduce<op, uintl , uintl >(in, dim); break;
- case s64: res = reduce<op, intl , intl >(in, dim); break;
- case u8: res = reduce<op, uchar , uint >(in, dim); break;
+ case f32: res = reduce<op, float , float >(in, dim, change_nan, nanval); break;
+ case f64: res = reduce<op, double , double >(in, dim, change_nan, nanval); break;
+ case c32: res = reduce<op, cfloat , cfloat >(in, dim, change_nan, nanval); break;
+ case c64: res = reduce<op, cdouble, cdouble>(in, dim, change_nan, nanval); break;
+ case u32: res = reduce<op, uint , uint >(in, dim, change_nan, nanval); break;
+ case s32: res = reduce<op, int , int >(in, dim, change_nan, nanval); break;
+ case u64: res = reduce<op, uintl , uintl >(in, dim, change_nan, nanval); break;
+ case s64: res = reduce<op, intl , intl >(in, dim, change_nan, nanval); break;
+ case u8: res = reduce<op, uchar , uint >(in, dim, change_nan, nanval); break;
// Make sure you are adding only "1" for every non zero value, even if op == af_add_t
- case b8: res = reduce<af_notzero_t, char , uint >(in, dim); break;
+ case b8: res = reduce<af_notzero_t, char , uint >(in, dim, change_nan, nanval); break;
default: TYPE_ERROR(1, type);
}
std::swap(*out, res);
@@ -165,6 +167,16 @@ af_err af_product(af_array *out, const af_array in, const int dim)
return reduce_promote<af_mul_t>(out, in, dim);
}
+af_err af_sum_nan(af_array *out, const af_array in, const int dim, const double nanval)
+{
+ return reduce_promote<af_add_t>(out, in, dim, true, nanval);
+}
+
+af_err af_product_nan(af_array *out, const af_array in, const int dim, const double nanval)
+{
+ return reduce_promote<af_mul_t>(out, in, dim, true, nanval);
+}
+
af_err af_count(af_array *out, const af_array in, const int dim)
{
return reduce_type<af_notzero_t, uint>(out, in, dim);
@@ -181,9 +193,9 @@ af_err af_any_true(af_array *out, const af_array in, const int dim)
}
template<af_op_t op, typename Ti, typename To>
-static inline To reduce_all(const af_array in)
+static inline To reduce_all(const af_array in, bool change_nan = false, double nanval = 0)
{
- return reduce_all<op,Ti,To>(getArray<Ti>(in));
+ return reduce_all<op,Ti,To>(getArray<Ti>(in), change_nan, nanval);
}
template<af_op_t op, typename To>
@@ -267,7 +279,8 @@ static af_err reduce_all_common(double *real_val, double *imag_val, const af_arr
}
template<af_op_t op>
-static af_err reduce_all_promote(double *real_val, double *imag_val, const af_array in)
+static af_err reduce_all_promote(double *real_val, double *imag_val, const af_array in,
+ bool change_nan=false, double nanval=0)
{
try {
@@ -282,15 +295,15 @@ static af_err reduce_all_promote(double *real_val, double *imag_val, const af_ar
cdouble cdval;
switch(type) {
- case f32: *real_val = (double)reduce_all<op, float , float >(in); break;
- case f64: *real_val = (double)reduce_all<op, double , double >(in); break;
- case u32: *real_val = (double)reduce_all<op, uint , uint >(in); break;
- case s32: *real_val = (double)reduce_all<op, int , int >(in); break;
- case u64: *real_val = (double)reduce_all<op, uintl , uintl >(in); break;
- case s64: *real_val = (double)reduce_all<op, intl , intl >(in); break;
- case u8: *real_val = (double)reduce_all<op, uchar , uint >(in); break;
+ case f32: *real_val = (double)reduce_all<op, float , float >(in, change_nan, nanval); break;
+ case f64: *real_val = (double)reduce_all<op, double , double >(in, change_nan, nanval); break;
+ case u32: *real_val = (double)reduce_all<op, uint , uint >(in, change_nan, nanval); break;
+ case s32: *real_val = (double)reduce_all<op, int , int >(in, change_nan, nanval); break;
+ case u64: *real_val = (double)reduce_all<op, uintl , uintl >(in, change_nan, nanval); break;
+ case s64: *real_val = (double)reduce_all<op, intl , intl >(in, change_nan, nanval); break;
+ case u8: *real_val = (double)reduce_all<op, uchar , uint >(in, change_nan, nanval); break;
// Make sure you are adding only "1" for every non zero value, even if op == af_add_t
- case b8: *real_val = (double)reduce_all<af_notzero_t, char , uint >(in); break;
+ case b8: *real_val = (double)reduce_all<af_notzero_t, char, uint>(in, change_nan, nanval); break;
case c32:
cfval = reduce_all<op, cfloat, cfloat>(in);
@@ -479,3 +492,13 @@ af_err af_imax_all(double *real, double *imag, unsigned *idx, const af_array in)
{
return ireduce_all_common<af_max_t>(real, imag, idx, in);
}
+
+af_err af_sum_nan_all(double *real, double *imag, const af_array in, const double nanval)
+{
+ return reduce_all_promote<af_add_t>(real, imag, in, true, nanval);
+}
+
+af_err af_product_nan_all(double *real, double *imag, const af_array in, const double nanval)
+{
+ return reduce_all_promote<af_mul_t>(real, imag, in, true, nanval);
+}
diff --git a/src/api/cpp/reduce.cpp b/src/api/cpp/reduce.cpp
index 63d0200..7be05bb 100644
--- a/src/api/cpp/reduce.cpp
+++ b/src/api/cpp/reduce.cpp
@@ -22,6 +22,13 @@ namespace af
return array(out);
}
+ array sum(const array &in, const int dim, const double nanval)
+ {
+ af_array out = 0;
+ AF_THROW(af_sum_nan(&out, in.get(), dim, nanval));
+ return array(out);
+ }
+
array product(const array &in, const int dim)
{
af_array out = 0;
@@ -29,6 +36,13 @@ namespace af
return array(out);
}
+ array product(const array &in, const int dim, const double nanval)
+ {
+ af_array out = 0;
+ AF_THROW(af_product_nan(&out, in.get(), dim, nanval));
+ return array(out);
+ }
+
array mul(const array &in, const int dim)
{
return product(in, dim);
@@ -91,6 +105,21 @@ namespace af
idx = array(loc);
}
+
+#define INSTANTIATE(fnC, fnCPP) \
+ INSTANTIATE_REAL(fnC, fnCPP, float) \
+ INSTANTIATE_REAL(fnC, fnCPP, double) \
+ INSTANTIATE_REAL(fnC, fnCPP, int) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned) \
+ INSTANTIATE_REAL(fnC, fnCPP, long) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned long) \
+ INSTANTIATE_REAL(fnC, fnCPP, long long) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned long long) \
+ INSTANTIATE_REAL(fnC, fnCPP, char) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned char) \
+ INSTANTIATE_CPLX(fnC, fnCPP, af_cfloat, float) \
+ INSTANTIATE_CPLX(fnC, fnCPP, af_cdouble, double) \
+
#define INSTANTIATE_REAL(fnC, fnCPP, T) \
template<> AFAPI \
T fnCPP(const array &in) \
@@ -111,20 +140,6 @@ namespace af
return out; \
} \
-#define INSTANTIATE(fnC, fnCPP) \
- INSTANTIATE_REAL(fnC, fnCPP, float) \
- INSTANTIATE_REAL(fnC, fnCPP, double) \
- INSTANTIATE_REAL(fnC, fnCPP, int) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned) \
- INSTANTIATE_REAL(fnC, fnCPP, long) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned long) \
- INSTANTIATE_REAL(fnC, fnCPP, long long) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned long long) \
- INSTANTIATE_REAL(fnC, fnCPP, char) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned char) \
- INSTANTIATE_CPLX(fnC, fnCPP, af_cfloat, float) \
- INSTANTIATE_CPLX(fnC, fnCPP, af_cdouble, double) \
-
INSTANTIATE(sum, sum)
INSTANTIATE(product, product)
INSTANTIATE(min, min)
@@ -136,15 +151,41 @@ namespace af
INSTANTIATE_REAL(all_true, allTrue, bool);
INSTANTIATE_REAL(any_true, anyTrue, bool);
-#undef INSTANTIATE
#undef INSTANTIATE_REAL
#undef INSTANTIATE_CPLX
-#define INSTANTIATE_COMPAT(fnCPP, fnCompat, T) \
- template<> AFAPI \
- T fnCompat(const array &in) \
- { \
- return fnCPP<T>(in); \
+#define INSTANTIATE_REAL(fnC, fnCPP, T) \
+ template<> AFAPI \
+ T fnCPP(const array &in, const double nanval) \
+ { \
+ double rval, ival; \
+ AF_THROW(af_##fnC##_all(&rval, &ival, in.get(), nanval)); \
+ return (T)(rval); \
+ } \
+
+
+#define INSTANTIATE_CPLX(fnC, fnCPP, T, Tr) \
+ template<> AFAPI \
+ T fnCPP(const array &in, const double nanval) \
+ { \
+ double rval, ival; \
+ AF_THROW(af_##fnC##_all(&rval, &ival, in.get(), nanval)); \
+ T out((Tr)rval, (Tr)ival); \
+ return out; \
+ } \
+
+INSTANTIATE(sum_nan, sum)
+INSTANTIATE(product_nan, product)
+
+#undef INSTANTIATE_REAL
+#undef INSTANTIATE_CPLX
+#undef INSTANTIATE
+
+#define INSTANTIATE_COMPAT(fnCPP, fnCompat, T) \
+ template<> AFAPI \
+ T fnCompat(const array &in) \
+ { \
+ return fnCPP<T>(in); \
}
#define INSTANTIATE(fnCPP, fnCompat) \
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index 428e5d9..5724508 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -25,16 +25,16 @@ namespace cpu
{
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
const Ti *in , const dim4 &istrides, const dim4 &idims,
- const int dim)
+ const int dim, bool change_nan, double nanval)
{
static const int D1 = D - 1;
static reduce_dim<op, Ti, To, D1> reduce_dim_next;
for (dim_t i = 0; i < odims[D1]; i++) {
- reduce_dim_next(out + i * ostrides[D1],
- ostrides, odims,
- in + i * istrides[D1],
- istrides, idims,
- dim);
+ reduce_dim_next(out + i * ostrides[D1],
+ ostrides, odims,
+ in + i * istrides[D1],
+ istrides, idims,
+ dim, change_nan, nanval);
}
}
};
@@ -47,13 +47,14 @@ namespace cpu
Binary<To, op> reduce;
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
const Ti *in , const dim4 &istrides, const dim4 &idims,
- const int dim)
+ const int dim, bool change_nan, double nanval)
{
dim_t stride = istrides[dim];
To out_val = reduce.init();
for (dim_t i = 0; i < idims[dim]; i++) {
To in_val = transform(in[i * stride]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
out_val = reduce(in_val, out_val);
}
@@ -64,10 +65,10 @@ namespace cpu
template<af_op_t op, typename Ti, typename To>
using reduce_dim_func = std::function<void(To*,const dim4&, const dim4&,
const Ti*, const dim4&, const dim4&,
- const int)>;
+ const int, bool, double)>;
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim)
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
{
dim4 odims = in.dims();
odims[dim] = 1;
@@ -79,13 +80,14 @@ namespace cpu
, reduce_dim<op, Ti, To, 4>()};
reduce_funcs[in.ndims() - 1](out.get(), out.strides(), out.dims(),
- in.get(), in.strides(), in.dims(), dim);
+ in.get(), in.strides(), in.dims(), dim,
+ change_nan, nanval);
return out;
}
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in)
+ To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
{
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
@@ -109,8 +111,9 @@ namespace cpu
for(dim_t i = 0; i < dims[0]; i++) {
dim_t idx = i + off1 + off2 + off3;
- To val = transform(inPtr[idx]);
- out = reduce(val, out);
+ To in_val = transform(inPtr[idx]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+ out = reduce(in_val, out);
}
}
}
@@ -120,8 +123,10 @@ namespace cpu
}
#define INSTANTIATE(ROp, Ti, To) \
- template Array<To> reduce<ROp, Ti, To>(const Array<Ti> &in, const int dim); \
- template To reduce_all<ROp, Ti, To>(const Array<Ti> &in);
+ template Array<To> reduce<ROp, Ti, To>(const Array<Ti> &in, const int dim, \
+ bool change_nan, double nanval); \
+ template To reduce_all<ROp, Ti, To>(const Array<Ti> &in, \
+ bool change_nan, double nanval);
//min
INSTANTIATE(af_min_t, float , float )
diff --git a/src/backend/cpu/reduce.hpp b/src/backend/cpu/reduce.hpp
index 039a47d..4e139f0 100644
--- a/src/backend/cpu/reduce.hpp
+++ b/src/backend/cpu/reduce.hpp
@@ -6,7 +6,7 @@
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-
+#pragma once
#include <af/array.h>
#include <Array.hpp>
#include <ops.hpp>
@@ -14,8 +14,8 @@
namespace cpu
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim);
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan=false, double nanval=0);
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in);
+ To reduce_all(const Array<Ti> &in, bool change_nan=false, double nanval=0);
}
diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp
index a5961f3..0263142 100644
--- a/src/backend/cuda/kernel/reduce.hpp
+++ b/src/backend/cuda/kernel/reduce.hpp
@@ -29,7 +29,8 @@ namespace kernel
__global__
static void reduce_dim_kernel(Param<To> out,
CParam <Ti> in,
- uint blocks_x, uint blocks_y, uint offset_dim)
+ uint blocks_x, uint blocks_y, uint offset_dim,
+ bool change_nan, To nanval)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
@@ -73,6 +74,7 @@ namespace kernel
To out_val = reduce.init();
for (int id = id_dim_in; is_valid && (id < in.dims[dim]); id += offset_dim * blockDim.y) {
To in_val = transform(*iptr);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = reduce(in_val, out_val);
iptr = iptr + offset_dim * blockDim.y * istride_dim;
}
@@ -106,7 +108,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op, int dim>
void reduce_dim_launcher(Param<To> out, CParam<Ti> in,
- const uint threads_y, const uint blocks_dim[4])
+ const uint threads_y, const uint blocks_dim[4],
+ bool change_nan, double nanval)
{
dim3 threads(THREADS_X, threads_y);
@@ -116,23 +119,27 @@ namespace kernel
switch (threads_y) {
case 8:
(reduce_dim_kernel<Ti, To, op, dim, 8>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
case 4:
(reduce_dim_kernel<Ti, To, op, dim, 4>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
case 2:
(reduce_dim_kernel<Ti, To, op, dim, 2>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
case 1:
(reduce_dim_kernel<Ti, To, op, dim, 1>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
}
POST_LAUNCH_CHECK();
}
template<typename Ti, typename To, af_op_t op, int dim>
- void reduce_dim(Param<To> out, CParam<Ti> in)
+ void reduce_dim(Param<To> out, CParam<Ti> in, bool change_nan, double nanval)
{
uint threads_y = std::min(THREADS_Y, nextpow2(in.dims[dim]));
uint threads_x = THREADS_X;
@@ -154,15 +161,17 @@ namespace kernel
for (int k = dim + 1; k < 4; k++) tmp.strides[k] *= blocks_dim[dim];
}
- reduce_dim_launcher<Ti, To, op, dim>(tmp, in, threads_y, blocks_dim);
+ reduce_dim_launcher<Ti, To, op, dim>(tmp, in, threads_y, blocks_dim, change_nan, nanval);
if (blocks_dim[dim] > 1) {
blocks_dim[dim] = 1;
if (op == af_notzero_t) {
- reduce_dim_launcher<To, To, af_add_t, dim>(out, tmp, threads_y, blocks_dim);
+ reduce_dim_launcher<To, To, af_add_t, dim>(out, tmp, threads_y, blocks_dim,
+ change_nan, nanval);
} else {
- reduce_dim_launcher<To, To, op, dim>(out, tmp, threads_y, blocks_dim);
+ reduce_dim_launcher<To, To, op, dim>(out, tmp, threads_y, blocks_dim,
+ change_nan, nanval);
}
memFree(tmp.ptr);
@@ -171,20 +180,6 @@ namespace kernel
}
template<typename To, af_op_t op>
- __device__ void warp_reduce_sync(To *s_ptr, uint tidx)
- {
-
- }
-
-#if (__CUDA_ARCH__ >= 300)
- template<typename To, af_op_t op>
- __device__ void warp_reduce_shfl(To *s_ptr, uint tidx)
- {
-
- }
-#endif
-
- template<typename To, af_op_t op>
struct WarpReduce
{
__device__ To operator()(To *s_ptr, uint tidx)
@@ -230,7 +225,8 @@ namespace kernel
__global__
static void reduce_first_kernel(Param<To> out,
CParam<Ti> in,
- uint blocks_x, uint blocks_y, uint repeat)
+ uint blocks_x, uint blocks_y, uint repeat,
+ bool change_nan, To nanval)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
@@ -263,6 +259,7 @@ namespace kernel
for (int id = xid; id < lim; id += DIMX) {
To in_val = transform(iptr[id]);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = reduce(in_val, out_val);
}
@@ -294,7 +291,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op>
void reduce_first_launcher(Param<To> out, CParam<Ti> in,
- const uint blocks_x, const uint blocks_y, const uint threads_x)
+ const uint blocks_x, const uint blocks_y, const uint threads_x,
+ bool change_nan, double nanval)
{
dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
@@ -306,23 +304,23 @@ namespace kernel
switch (threads_x) {
case 32:
(reduce_first_kernel<Ti, To, op, 32>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 64:
(reduce_first_kernel<Ti, To, op, 64>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 128:
(reduce_first_kernel<Ti, To, op, 128>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 256:
(reduce_first_kernel<Ti, To, op, 256>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
}
POST_LAUNCH_CHECK();
}
template<typename Ti, typename To, af_op_t op>
- void reduce_first(Param<To> out, CParam<Ti> in)
+ void reduce_first(Param<To> out, CParam<Ti> in, bool change_nan, double nanval)
{
uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
threads_x = std::min(threads_x, THREADS_PER_BLOCK);
@@ -342,15 +340,17 @@ namespace kernel
for (int k = 1; k < 4; k++) tmp.strides[k] *= blocks_x;
}
- reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x);
+ reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x, change_nan, nanval);
if (blocks_x > 1) {
//FIXME: Is there an alternative to the if condition ?
if (op == af_notzero_t) {
- reduce_first_launcher<To, To, af_add_t>(out, tmp, 1, blocks_y, threads_x);
+ reduce_first_launcher<To, To, af_add_t>(out, tmp, 1, blocks_y, threads_x,
+ change_nan, nanval);
} else {
- reduce_first_launcher<To, To, op>(out, tmp, 1, blocks_y, threads_x);
+ reduce_first_launcher<To, To, op>(out, tmp, 1, blocks_y, threads_x,
+ change_nan, nanval);
}
memFree(tmp.ptr);
@@ -358,18 +358,18 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- void reduce(Param<To> out, CParam<Ti> in, int dim)
+ void reduce(Param<To> out, CParam<Ti> in, int dim, bool change_nan, double nanval)
{
switch (dim) {
- case 0: return reduce_first<Ti, To, op >(out, in);
- case 1: return reduce_dim <Ti, To, op, 1>(out, in);
- case 2: return reduce_dim <Ti, To, op, 2>(out, in);
- case 3: return reduce_dim <Ti, To, op, 3>(out, in);
+ case 0: return reduce_first<Ti, To, op >(out, in, change_nan, nanval);
+ case 1: return reduce_dim <Ti, To, op, 1>(out, in, change_nan, nanval);
+ case 2: return reduce_dim <Ti, To, op, 2>(out, in, change_nan, nanval);
+ case 3: return reduce_dim <Ti, To, op, 3>(out, in, change_nan, nanval);
}
}
template<typename Ti, typename To, af_op_t op>
- To reduce_all(CParam<Ti> in)
+ To reduce_all(CParam<Ti> in, bool change_nan, double nanval)
{
int in_elements = in.strides[3] * in.dims[3];
@@ -409,7 +409,8 @@ namespace kernel
int tmp_elements = tmp.strides[3] * tmp.dims[3];
tmp.ptr = memAlloc<To>(tmp_elements);
- reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x);
+ reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x,
+ change_nan, nanval);
scoped_ptr<To> h_ptr(new To[tmp_elements]);
To* h_ptr_raw = h_ptr.get();
@@ -434,9 +435,12 @@ namespace kernel
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
To out = reduce.init();
+ To nanval_to = scalar<To>(nanval);
for (int i = 0; i < in_elements; i++) {
- out = reduce(out, transform(h_ptr_raw[i]));
+ To in_val = transform(h_ptr_raw[i]);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval_to;
+ out = reduce(out, in_val);
}
return out;
diff --git a/src/backend/cuda/reduce.hpp b/src/backend/cuda/reduce.hpp
index 2af2f5e..82755bc 100644
--- a/src/backend/cuda/reduce.hpp
+++ b/src/backend/cuda/reduce.hpp
@@ -6,7 +6,7 @@
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-
+#pragma once
#include <af/array.h>
#include <Array.hpp>
#include <ops.hpp>
@@ -14,8 +14,8 @@
namespace cuda
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim);
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan=false, double nanval=0);
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in);
+ To reduce_all(const Array<Ti> &in, bool change_nan=false, double nanval=0);
}
diff --git a/src/backend/cuda/reduce_impl.hpp b/src/backend/cuda/reduce_impl.hpp
index abc6d5c..b1899f4 100644
--- a/src/backend/cuda/reduce_impl.hpp
+++ b/src/backend/cuda/reduce_impl.hpp
@@ -23,23 +23,24 @@ using af::dim4;
namespace cuda
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim)
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
{
dim4 odims = in.dims();
odims[dim] = 1;
Array<To> out = createEmptyArray<To>(odims);
- kernel::reduce<Ti, To, op>(out, in, dim);
+ kernel::reduce<Ti, To, op>(out, in, dim, change_nan, nanval);
return out;
}
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in)
+ To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
{
- return kernel::reduce_all<Ti, To, op>(in);
+ return kernel::reduce_all<Ti, To, op>(in, change_nan, nanval);
}
}
#define INSTANTIATE(Op, Ti, To) \
- template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim); \
- template To reduce_all<Op, Ti, To>(const Array<Ti> &in);
+ template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim, \
+ bool change_nan, double nanval); \
+ template To reduce_all<Op, Ti, To>(const Array<Ti> &in, bool change_nan, double nanval);
diff --git a/src/backend/opencl/kernel/reduce.hpp b/src/backend/opencl/kernel/reduce.hpp
index 0d8be35..094b42f 100644
--- a/src/backend/opencl/kernel/reduce.hpp
+++ b/src/backend/opencl/kernel/reduce.hpp
@@ -42,7 +42,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op, int dim, int threads_y>
void reduce_dim_launcher(Param out, Param in,
- const uint groups_all[4])
+ const uint groups_all[4],
+ int change_nan, double nanval)
{
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
static std::map<int, Program*> reduceProgs;
@@ -84,34 +85,49 @@ namespace kernel
auto reduceOp = make_kernel<Buffer, KParam,
Buffer, KParam,
- uint, uint, uint>(*reduceKerns[device]);
+ uint, uint, uint,
+ int, To>(*reduceKerns[device]);
reduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
*in.data, in.info,
groups_all[0],
groups_all[1],
- groups_all[dim]);
+ groups_all[dim],
+ change_nan,
+ scalar<To>(nanval));
CL_DEBUG_FINISH(getQueue());
}
template<typename Ti, typename To, af_op_t op, int dim>
void reduce_dim_fn(Param out, Param in,
- const uint threads_y, const uint groups_all[4])
+ const uint threads_y, const uint groups_all[4],
+ int change_nan, double nanval)
{
switch(threads_y) {
- case 8: return reduce_dim_launcher<Ti, To, op, dim, 8>(out, in, groups_all);
- case 4: return reduce_dim_launcher<Ti, To, op, dim, 4>(out, in, groups_all);
- case 2: return reduce_dim_launcher<Ti, To, op, dim, 2>(out, in, groups_all);
- case 1: return reduce_dim_launcher<Ti, To, op, dim, 1>(out, in, groups_all);
- case 16: return reduce_dim_launcher<Ti, To, op, dim, 16>(out, in, groups_all);
- case 32: return reduce_dim_launcher<Ti, To, op, dim, 32>(out, in, groups_all);
+ case 8: return reduce_dim_launcher<Ti, To, op, dim, 8>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 4: return reduce_dim_launcher<Ti, To, op, dim, 4>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 2: return reduce_dim_launcher<Ti, To, op, dim, 2>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 1: return reduce_dim_launcher<Ti, To, op, dim, 1>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 16: return reduce_dim_launcher<Ti, To, op, dim, 16>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 32: return reduce_dim_launcher<Ti, To, op, dim, 32>(out, in, groups_all,
+ change_nan, nanval);
}
}
template<typename Ti, typename To, af_op_t op, int dim>
- void reduce_dim(Param out, Param in)
+ void reduce_dim(Param out, Param in, int change_nan, double nanval)
{
uint threads_y = std::min(THREADS_Y, nextpow2(in.info.dims[dim]));
uint threads_x = THREADS_X;
@@ -136,15 +152,17 @@ namespace kernel
for (int k = dim + 1; k < 4; k++) tmp.info.strides[k] *= groups_all[dim];
}
- reduce_dim_fn<Ti, To, op, dim>(tmp, in, threads_y, groups_all);
+ reduce_dim_fn<Ti, To, op, dim>(tmp, in, threads_y, groups_all, change_nan, nanval);
if (groups_all[dim] > 1) {
groups_all[dim] = 1;
if (op == af_notzero_t) {
- reduce_dim_fn<To, To, af_add_t, dim>(out, tmp, threads_y, groups_all);
+ reduce_dim_fn<To, To, af_add_t, dim>(out, tmp, threads_y, groups_all,
+ change_nan, nanval);
} else {
- reduce_dim_fn<To, To, op, dim>(out, tmp, threads_y, groups_all);
+ reduce_dim_fn<To, To, op, dim>(out, tmp, threads_y, groups_all,
+ change_nan, nanval);
}
bufferFree(tmp.data);
}
@@ -154,7 +172,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op, int threads_x>
void reduce_first_launcher(Param out, Param in,
const uint groups_x,
- const uint groups_y)
+ const uint groups_y,
+ int change_nan, double nanval)
{
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
static std::map<int, Program*> reduceProgs;
@@ -197,11 +216,12 @@ namespace kernel
auto reduceOp = make_kernel<Buffer, KParam,
Buffer, KParam,
- uint, uint, uint>(*reduceKerns[device]);
+ uint, uint, uint,
+ int, To>(*reduceKerns[device]);
reduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
- *in.data, in.info, groups_x, groups_y, repeat);
+ *in.data, in.info, groups_x, groups_y, repeat, change_nan, scalar<To>(nanval));
CL_DEBUG_FINISH(getQueue());
}
@@ -210,24 +230,25 @@ namespace kernel
void reduce_first_fn(Param out, Param in,
const uint groups_x,
const uint groups_y,
- const uint threads_x)
+ const uint threads_x,
+ int change_nan, double nanval)
{
switch(threads_x) {
case 32: return reduce_first_launcher<Ti, To, op, 32>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 64: return reduce_first_launcher<Ti, To, op, 64>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 128: return reduce_first_launcher<Ti, To, op, 128>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 256: return reduce_first_launcher<Ti, To, op, 256>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 512: return reduce_first_launcher<Ti, To, op, 512>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
}
}
template<typename Ti, typename To, af_op_t op>
- void reduce_first(Param out, Param in)
+ void reduce_first(Param out, Param in, int change_nan, double nanval)
{
uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
threads_x = std::min(threads_x, THREADS_PER_GROUP);
@@ -249,15 +270,15 @@ namespace kernel
for (int k = 1; k < 4; k++) tmp.info.strides[k] *= groups_x;
}
- reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x);
+ reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
if (groups_x > 1) {
//FIXME: Is there an alternative to the if condition ?
if (op == af_notzero_t) {
- reduce_first_fn<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x);
+ reduce_first_fn<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
} else {
- reduce_first_fn<To, To, op>(out, tmp, 1, groups_y, threads_x);
+ reduce_first_fn<To, To, op>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
}
bufferFree(tmp.data);
@@ -265,14 +286,14 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- void reduce(Param out, Param in, int dim)
+ void reduce(Param out, Param in, int dim, int change_nan, double nanval)
{
try {
switch (dim) {
- case 0: return reduce_first<Ti, To, op >(out, in);
- case 1: return reduce_dim <Ti, To, op, 1>(out, in);
- case 2: return reduce_dim <Ti, To, op, 2>(out, in);
- case 3: return reduce_dim <Ti, To, op, 3>(out, in);
+ case 0: return reduce_first<Ti, To, op >(out, in, change_nan, nanval);
+ case 1: return reduce_dim <Ti, To, op, 1>(out, in, change_nan, nanval);
+ case 2: return reduce_dim <Ti, To, op, 2>(out, in, change_nan, nanval);
+ case 3: return reduce_dim <Ti, To, op, 3>(out, in, change_nan, nanval);
}
} catch(cl::Error ex) {
CL_TO_AF_ERROR(ex);
@@ -280,7 +301,7 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- To reduce_all(Param in)
+ To reduce_all(Param in, int change_nan, double nanval)
{
try {
int in_elements = in.info.dims[3] * in.info.strides[3];
@@ -321,7 +342,7 @@ namespace kernel
int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];
tmp.data = bufferAlloc(tmp_elements * sizeof(To));
- reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x);
+ reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
unique_ptr<To> h_ptr(new To[tmp_elements]);
getQueue().enqueueReadBuffer(*tmp.data, CL_TRUE, 0, sizeof(To) * tmp_elements, h_ptr.get());
@@ -343,9 +364,12 @@ namespace kernel
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
To out = reduce.init();
+ To nanval_to = scalar<To>(nanval);
for (int i = 0; i < (int)in_elements; i++) {
- out = reduce(out, transform(h_ptr.get()[i]));
+ To in_val = transform(h_ptr.get()[i]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval_to : in_val;
+ out = reduce(out, in_val);
}
return out;
diff --git a/src/backend/opencl/kernel/reduce_dim.cl b/src/backend/opencl/kernel/reduce_dim.cl
index 5ac9f22..012661c 100644
--- a/src/backend/opencl/kernel/reduce_dim.cl
+++ b/src/backend/opencl/kernel/reduce_dim.cl
@@ -12,7 +12,8 @@ void reduce_dim_kernel(__global To *oData,
KParam oInfo,
const __global Ti *iData,
KParam iInfo,
- uint groups_x, uint groups_y, uint group_dim)
+ uint groups_x, uint groups_y, uint group_dim,
+ int change_nan, To nanval)
{
const uint lidx = get_local_id(0);
const uint lidy = get_local_id(1);
@@ -54,6 +55,7 @@ void reduce_dim_kernel(__global To *oData,
id += group_dim * get_local_size(1)) {
To in_val = transform(*iData);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = binOp(in_val, out_val);
iData = iData + group_dim * get_local_size(1) * istride_dim;
}
diff --git a/src/backend/opencl/kernel/reduce_first.cl b/src/backend/opencl/kernel/reduce_first.cl
index 8349048..16dcf9d 100644
--- a/src/backend/opencl/kernel/reduce_first.cl
+++ b/src/backend/opencl/kernel/reduce_first.cl
@@ -12,7 +12,8 @@ void reduce_first_kernel(__global To *oData,
KParam oInfo,
const __global Ti *iData,
KParam iInfo,
- uint groups_x, uint groups_y, uint repeat)
+ uint groups_x, uint groups_y, uint repeat,
+ int change_nan, To nanval)
{
const uint lidx = get_local_id(0);
const uint lidy = get_local_id(1);
@@ -40,6 +41,7 @@ void reduce_first_kernel(__global To *oData,
for (int id = xid; cond && id < lim; id += DIMX) {
To in_val = transform(iData[id]);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = binOp(in_val, out_val);
}
diff --git a/src/backend/opencl/reduce.hpp b/src/backend/opencl/reduce.hpp
index 6caa724..0ddc765 100644
--- a/src/backend/opencl/reduce.hpp
+++ b/src/backend/opencl/reduce.hpp
@@ -7,6 +7,7 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
#include <af/array.h>
#include <Array.hpp>
#include <ops.hpp>
@@ -14,8 +15,8 @@
namespace opencl
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim);
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan=false, double nanval=0);
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in);
+ To reduce_all(const Array<Ti> &in, bool change_nan=false, double nanval=0);
}
diff --git a/src/backend/opencl/reduce_impl.hpp b/src/backend/opencl/reduce_impl.hpp
index a031039..a6e8efb 100644
--- a/src/backend/opencl/reduce_impl.hpp
+++ b/src/backend/opencl/reduce_impl.hpp
@@ -21,22 +21,23 @@ using af::dim4;
namespace opencl
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim)
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
{
dim4 odims = in.dims();
odims[dim] = 1;
Array<To> out = createEmptyArray<To>(odims);
- kernel::reduce<Ti, To, op>(out, in, dim);
+ kernel::reduce<Ti, To, op>(out, in, dim, change_nan, nanval);
return out;
}
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in)
+ To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
{
- return kernel::reduce_all<Ti, To, op>(in);
+ return kernel::reduce_all<Ti, To, op>(in, change_nan, nanval);
}
}
#define INSTANTIATE(Op, Ti, To) \
- template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim); \
- template To reduce_all<Op, Ti, To>(const Array<Ti> &in);
+ template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim, \
+ bool change_nan, double nanval); \
+ template To reduce_all<Op, Ti, To>(const Array<Ti> &in, bool change_nan, double nanval);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list