[arrayfire] 87/408: FEAT: Added support to substitute nan values for sum and product
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Sep 21 19:11:24 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository arrayfire.
commit 3b2ce09be4b669353df6d4737609b423d7906ccd
Author: Pavan Yalamanchili <pavan at arrayfire.com>
Date: Thu Jul 2 15:03:31 2015 -0400
FEAT: Added support to substitute nan values for sum and product
---
include/af/algorithm.h | 126 ++++++++++++++++++++++++++++++
src/api/c/reduce.cpp | 71 +++++++++++------
src/api/cpp/reduce.cpp | 81 ++++++++++++++-----
src/backend/cpu/reduce.cpp | 35 +++++----
src/backend/cpu/reduce.hpp | 6 +-
src/backend/cuda/kernel/reduce.hpp | 88 +++++++++++----------
src/backend/cuda/reduce.hpp | 6 +-
src/backend/cuda/reduce_impl.hpp | 13 +--
src/backend/opencl/kernel/reduce.hpp | 94 +++++++++++++---------
src/backend/opencl/kernel/reduce_dim.cl | 4 +-
src/backend/opencl/kernel/reduce_first.cl | 4 +-
src/backend/opencl/reduce.hpp | 5 +-
src/backend/opencl/reduce_impl.hpp | 13 +--
13 files changed, 388 insertions(+), 158 deletions(-)
diff --git a/include/af/algorithm.h b/include/af/algorithm.h
index a4d8833..e274d17 100644
--- a/include/af/algorithm.h
+++ b/include/af/algorithm.h
@@ -29,6 +29,19 @@ namespace af
AFAPI array sum(const array &in, const int dim = -1);
/**
+ C++ Interface for sum of elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the add operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return result of the sum of all values along dimension \p dim
+
+ \ingroup reduce_func_sum
+
+ */
+ AFAPI array sum(const array &in, const int dim, const double nanval);
+
+ /**
C++ Interface for product of elements in an array
\param[in] in is the input array
@@ -42,6 +55,20 @@ namespace af
AFAPI array product(const array &in, const int dim = -1);
/**
+ C++ Interface for product of elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the multiply operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return result of the product of all values along dimension \p dim
+
+ \ingroup reduce_func_product
+
+ */
+ AFAPI array product(const array &in, const int dim, const double nanval);
+
+
+ /**
C++ Interface for minimum values in an array
\param[in] in is the input array
@@ -51,6 +78,7 @@ namespace af
\ingroup reduce_func_min
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array min(const array &in, const int dim = -1);
@@ -64,6 +92,7 @@ namespace af
\ingroup reduce_func_max
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array max(const array &in, const int dim = -1);
@@ -77,6 +106,7 @@ namespace af
\ingroup reduce_func_all_true
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array allTrue(const array &in, const int dim = -1);
@@ -90,6 +120,7 @@ namespace af
\ingroup reduce_func_any_true
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are ignored
*/
AFAPI array anyTrue(const array &in, const int dim = -1);
@@ -103,6 +134,7 @@ namespace af
\ingroup reduce_func_count
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+ \note NaN values are treated as non zero.
*/
AFAPI array count(const array &in, const int dim = -1);
@@ -117,6 +149,17 @@ namespace af
template<typename T> T sum(const array &in);
/**
+ C++ Interface for sum of all elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] nanval Replace nans with the value passed to this function
+ \return the sum of all values of \p in
+
+ \ingroup reduce_func_sum
+ */
+ template<typename T> T sum(const array &in, double nanval);
+
+ /**
C++ Interface for product of all elements in an array
\param[in] in is the input array
@@ -127,12 +170,25 @@ namespace af
template<typename T> T product(const array &in);
/**
+ C++ Interface for product of all elements in an array while replacing nan values
+
+ \param[in] in is the input array
+ \param[in] nanval Replace nans with the value passed to this function
+ \return the product of all values of \p in
+
+ \ingroup reduce_func_product
+ */
+ template<typename T> T product(const array &in, double nanval);
+
+ /**
C++ Interface for getting minimum value of an array
\param[in] in is the input array
\return the minimum of all values of \p in
\ingroup reduce_func_min
+
+ \note NaN values are ignored
*/
template<typename T> T min(const array &in);
@@ -143,6 +199,8 @@ namespace af
\return the maximum of all values of \p in
\ingroup reduce_func_max
+
+ \note NaN values are ignored
*/
template<typename T> T max(const array &in);
@@ -153,6 +211,8 @@ namespace af
\return true if all values of \p in are true, false otherwise
\ingroup reduce_func_all_true
+
+ \note NaN values are ignored
*/
template<typename T> T allTrue(const array &in);
@@ -163,6 +223,8 @@ namespace af
\return true if any values of \p in are true, false otherwise
\ingroup reduce_func_any_true
+
+ \note NaN values are ignored
*/
template<typename T> T anyTrue(const array &in);
@@ -173,6 +235,8 @@ namespace af
\return the number of non-zero values in \p in
\ingroup reduce_func_count
+
+ \note NaN values are treated as non zero
*/
template<typename T> T count(const array &in);
@@ -187,6 +251,8 @@ namespace af
\ingroup reduce_func_min
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+
+ \note NaN values are ignored
*/
AFAPI void min(array &val, array &idx, const array &in, const int dim = -1);
@@ -201,6 +267,8 @@ namespace af
\ingroup reduce_func_max
\note \p dim is -1 by default. -1 denotes the first non-singleton dimension.
+
+ \note NaN values are ignored
*/
AFAPI void max(array &val, array &idx, const array &in, const int dim = -1);
@@ -212,6 +280,8 @@ namespace af
\param[in] in is the input array
\ingroup reduce_func_min
+
+ \note NaN values are ignored
*/
template<typename T> void min(T *val, unsigned *idx, const array &in);
@@ -223,6 +293,8 @@ namespace af
\param[in] in is the input array
\ingroup reduce_func_max
+
+ \note NaN values are ignored
*/
template<typename T> void max(T *val, unsigned *idx, const array &in);
@@ -369,6 +441,19 @@ extern "C" {
AFAPI af_err af_sum(af_array *out, const af_array in, const int dim);
/**
+ C Interface for sum of elements in an array while replacing nans
+
+ \param[out] out will contain the sum of all values in \p in along \p dim
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the add operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \ingroup reduce_func_sum
+ */
+ AFAPI af_err af_sum_nan(af_array *out, const af_array in, const int dim, const double nanval);
+
+ /**
C Interface for product of elements in an array
\param[out] out will contain the product of all values in \p in along \p dim
@@ -381,6 +466,19 @@ extern "C" {
AFAPI af_err af_product(af_array *out, const af_array in, const int dim);
/**
+ C Interface for product of elements in an array while replacing nans
+
+ \param[out] out will contain the product of all values in \p in along \p dim
+ \param[in] in is the input array
+ \param[in] dim The dimension along which the multiply operation occurs
+ \param[in] nanval Replace nans with the value passed to this function
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \ingroup reduce_func_product
+ */
+ AFAPI af_err af_product_nan(af_array *out, const af_array in, const int dim, const double nanval);
+
+ /**
C Interface for minimum values in an array
\param[out] out will contain the minimum of all values in \p in along \p dim
@@ -455,6 +553,20 @@ extern "C" {
AFAPI af_err af_sum_all(double *real, double *imag, const af_array in);
/**
+ C Interface for sum of all elements in an array while replacing nans
+
+ \param[out] real will contain the real part of adding all elements in input \p in
+ \param[out] imag will contain the imaginary part of adding all elements in input \p in
+ \param[in] in is the input array
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \note \p imag is always set to 0 when \p in is real
+
+ \ingroup reduce_func_sum
+ */
+ AFAPI af_err af_sum_nan_all(double *real, double *imag, const af_array in, const double nanval);
+
+ /**
C Interface for product of all elements in an array
\param[out] real will contain the real part of multiplying all elements in input \p in
@@ -469,6 +581,20 @@ extern "C" {
AFAPI af_err af_product_all(double *real, double *imag, const af_array in);
/**
+ C Interface for product of all elements in an array while replacing nans
+
+ \param[out] real will contain the real part of multiplying all elements in input \p in
+ \param[out] imag will contain the imaginary part of multiplying all elements in input \p in
+ \param[in] in is the input array
+ \return \ref AF_SUCCESS if the execution completes properly
+
+ \note \p imag is always set to 0 when \p in is real
+
+ \ingroup reduce_func_product
+ */
+ AFAPI af_err af_product_nan_all(double *real, double *imag, const af_array in, const double nanval);
+
+ /**
C Interface for getting minimum value of an array
\param[out] real will contain the real part of minimum value of all elements in input \p in
diff --git a/src/api/c/reduce.cpp b/src/api/c/reduce.cpp
index f4d5e61..cedf4f9 100644
--- a/src/api/c/reduce.cpp
+++ b/src/api/c/reduce.cpp
@@ -23,9 +23,10 @@ using af::dim4;
using namespace detail;
template<af_op_t op, typename Ti, typename To>
-static inline af_array reduce(const af_array in, const int dim)
+static inline af_array reduce(const af_array in, const int dim,
+ bool change_nan = false, double nanval = 0)
{
- return getHandle(reduce<op,Ti,To>(getArray<Ti>(in), dim));
+ return getHandle(reduce<op,Ti,To>(getArray<Ti>(in), dim, change_nan, nanval));
}
template<af_op_t op, typename To>
@@ -107,7 +108,8 @@ static af_err reduce_common(af_array *out, const af_array in, const int dim)
}
template<af_op_t op>
-static af_err reduce_promote(af_array *out, const af_array in, const int dim)
+static af_err reduce_promote(af_array *out, const af_array in, const int dim,
+ bool change_nan=false, double nanval=0)
{
try {
@@ -125,17 +127,17 @@ static af_err reduce_promote(af_array *out, const af_array in, const int dim)
af_array res;
switch(type) {
- case f32: res = reduce<op, float , float >(in, dim); break;
- case f64: res = reduce<op, double , double >(in, dim); break;
- case c32: res = reduce<op, cfloat , cfloat >(in, dim); break;
- case c64: res = reduce<op, cdouble, cdouble>(in, dim); break;
- case u32: res = reduce<op, uint , uint >(in, dim); break;
- case s32: res = reduce<op, int , int >(in, dim); break;
- case u64: res = reduce<op, uintl , uintl >(in, dim); break;
- case s64: res = reduce<op, intl , intl >(in, dim); break;
- case u8: res = reduce<op, uchar , uint >(in, dim); break;
+ case f32: res = reduce<op, float , float >(in, dim, change_nan, nanval); break;
+ case f64: res = reduce<op, double , double >(in, dim, change_nan, nanval); break;
+ case c32: res = reduce<op, cfloat , cfloat >(in, dim, change_nan, nanval); break;
+ case c64: res = reduce<op, cdouble, cdouble>(in, dim, change_nan, nanval); break;
+ case u32: res = reduce<op, uint , uint >(in, dim, change_nan, nanval); break;
+ case s32: res = reduce<op, int , int >(in, dim, change_nan, nanval); break;
+ case u64: res = reduce<op, uintl , uintl >(in, dim, change_nan, nanval); break;
+ case s64: res = reduce<op, intl , intl >(in, dim, change_nan, nanval); break;
+ case u8: res = reduce<op, uchar , uint >(in, dim, change_nan, nanval); break;
// Make sure you are adding only "1" for every non zero value, even if op == af_add_t
- case b8: res = reduce<af_notzero_t, char , uint >(in, dim); break;
+ case b8: res = reduce<af_notzero_t, char , uint >(in, dim, change_nan, nanval); break;
default: TYPE_ERROR(1, type);
}
std::swap(*out, res);
@@ -165,6 +167,16 @@ af_err af_product(af_array *out, const af_array in, const int dim)
return reduce_promote<af_mul_t>(out, in, dim);
}
+af_err af_sum_nan(af_array *out, const af_array in, const int dim, const double nanval)
+{
+ return reduce_promote<af_add_t>(out, in, dim, true, nanval);
+}
+
+af_err af_product_nan(af_array *out, const af_array in, const int dim, const double nanval)
+{
+ return reduce_promote<af_mul_t>(out, in, dim, true, nanval);
+}
+
af_err af_count(af_array *out, const af_array in, const int dim)
{
return reduce_type<af_notzero_t, uint>(out, in, dim);
@@ -181,9 +193,9 @@ af_err af_any_true(af_array *out, const af_array in, const int dim)
}
template<af_op_t op, typename Ti, typename To>
-static inline To reduce_all(const af_array in)
+static inline To reduce_all(const af_array in, bool change_nan = false, double nanval = 0)
{
- return reduce_all<op,Ti,To>(getArray<Ti>(in));
+ return reduce_all<op,Ti,To>(getArray<Ti>(in), change_nan, nanval);
}
template<af_op_t op, typename To>
@@ -267,7 +279,8 @@ static af_err reduce_all_common(double *real_val, double *imag_val, const af_arr
}
template<af_op_t op>
-static af_err reduce_all_promote(double *real_val, double *imag_val, const af_array in)
+static af_err reduce_all_promote(double *real_val, double *imag_val, const af_array in,
+ bool change_nan=false, double nanval=0)
{
try {
@@ -282,15 +295,15 @@ static af_err reduce_all_promote(double *real_val, double *imag_val, const af_ar
cdouble cdval;
switch(type) {
- case f32: *real_val = (double)reduce_all<op, float , float >(in); break;
- case f64: *real_val = (double)reduce_all<op, double , double >(in); break;
- case u32: *real_val = (double)reduce_all<op, uint , uint >(in); break;
- case s32: *real_val = (double)reduce_all<op, int , int >(in); break;
- case u64: *real_val = (double)reduce_all<op, uintl , uintl >(in); break;
- case s64: *real_val = (double)reduce_all<op, intl , intl >(in); break;
- case u8: *real_val = (double)reduce_all<op, uchar , uint >(in); break;
+ case f32: *real_val = (double)reduce_all<op, float , float >(in, change_nan, nanval); break;
+ case f64: *real_val = (double)reduce_all<op, double , double >(in, change_nan, nanval); break;
+ case u32: *real_val = (double)reduce_all<op, uint , uint >(in, change_nan, nanval); break;
+ case s32: *real_val = (double)reduce_all<op, int , int >(in, change_nan, nanval); break;
+ case u64: *real_val = (double)reduce_all<op, uintl , uintl >(in, change_nan, nanval); break;
+ case s64: *real_val = (double)reduce_all<op, intl , intl >(in, change_nan, nanval); break;
+ case u8: *real_val = (double)reduce_all<op, uchar , uint >(in, change_nan, nanval); break;
// Make sure you are adding only "1" for every non zero value, even if op == af_add_t
- case b8: *real_val = (double)reduce_all<af_notzero_t, char , uint >(in); break;
+ case b8: *real_val = (double)reduce_all<af_notzero_t, char, uint>(in, change_nan, nanval); break;
case c32:
cfval = reduce_all<op, cfloat, cfloat>(in);
@@ -479,3 +492,13 @@ af_err af_imax_all(double *real, double *imag, unsigned *idx, const af_array in)
{
return ireduce_all_common<af_max_t>(real, imag, idx, in);
}
+
+af_err af_sum_nan_all(double *real, double *imag, const af_array in, const double nanval)
+{
+ return reduce_all_promote<af_add_t>(real, imag, in, true, nanval);
+}
+
+af_err af_product_nan_all(double *real, double *imag, const af_array in, const double nanval)
+{
+ return reduce_all_promote<af_mul_t>(real, imag, in, true, nanval);
+}
diff --git a/src/api/cpp/reduce.cpp b/src/api/cpp/reduce.cpp
index 63d0200..7be05bb 100644
--- a/src/api/cpp/reduce.cpp
+++ b/src/api/cpp/reduce.cpp
@@ -22,6 +22,13 @@ namespace af
return array(out);
}
+ array sum(const array &in, const int dim, const double nanval)
+ {
+ af_array out = 0;
+ AF_THROW(af_sum_nan(&out, in.get(), dim, nanval));
+ return array(out);
+ }
+
array product(const array &in, const int dim)
{
af_array out = 0;
@@ -29,6 +36,13 @@ namespace af
return array(out);
}
+ array product(const array &in, const int dim, const double nanval)
+ {
+ af_array out = 0;
+ AF_THROW(af_product_nan(&out, in.get(), dim, nanval));
+ return array(out);
+ }
+
array mul(const array &in, const int dim)
{
return product(in, dim);
@@ -91,6 +105,21 @@ namespace af
idx = array(loc);
}
+
+#define INSTANTIATE(fnC, fnCPP) \
+ INSTANTIATE_REAL(fnC, fnCPP, float) \
+ INSTANTIATE_REAL(fnC, fnCPP, double) \
+ INSTANTIATE_REAL(fnC, fnCPP, int) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned) \
+ INSTANTIATE_REAL(fnC, fnCPP, long) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned long) \
+ INSTANTIATE_REAL(fnC, fnCPP, long long) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned long long) \
+ INSTANTIATE_REAL(fnC, fnCPP, char) \
+ INSTANTIATE_REAL(fnC, fnCPP, unsigned char) \
+ INSTANTIATE_CPLX(fnC, fnCPP, af_cfloat, float) \
+ INSTANTIATE_CPLX(fnC, fnCPP, af_cdouble, double) \
+
#define INSTANTIATE_REAL(fnC, fnCPP, T) \
template<> AFAPI \
T fnCPP(const array &in) \
@@ -111,20 +140,6 @@ namespace af
return out; \
} \
-#define INSTANTIATE(fnC, fnCPP) \
- INSTANTIATE_REAL(fnC, fnCPP, float) \
- INSTANTIATE_REAL(fnC, fnCPP, double) \
- INSTANTIATE_REAL(fnC, fnCPP, int) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned) \
- INSTANTIATE_REAL(fnC, fnCPP, long) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned long) \
- INSTANTIATE_REAL(fnC, fnCPP, long long) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned long long) \
- INSTANTIATE_REAL(fnC, fnCPP, char) \
- INSTANTIATE_REAL(fnC, fnCPP, unsigned char) \
- INSTANTIATE_CPLX(fnC, fnCPP, af_cfloat, float) \
- INSTANTIATE_CPLX(fnC, fnCPP, af_cdouble, double) \
-
INSTANTIATE(sum, sum)
INSTANTIATE(product, product)
INSTANTIATE(min, min)
@@ -136,15 +151,41 @@ namespace af
INSTANTIATE_REAL(all_true, allTrue, bool);
INSTANTIATE_REAL(any_true, anyTrue, bool);
-#undef INSTANTIATE
#undef INSTANTIATE_REAL
#undef INSTANTIATE_CPLX
-#define INSTANTIATE_COMPAT(fnCPP, fnCompat, T) \
- template<> AFAPI \
- T fnCompat(const array &in) \
- { \
- return fnCPP<T>(in); \
+#define INSTANTIATE_REAL(fnC, fnCPP, T) \
+ template<> AFAPI \
+ T fnCPP(const array &in, const double nanval) \
+ { \
+ double rval, ival; \
+ AF_THROW(af_##fnC##_all(&rval, &ival, in.get(), nanval)); \
+ return (T)(rval); \
+ } \
+
+
+#define INSTANTIATE_CPLX(fnC, fnCPP, T, Tr) \
+ template<> AFAPI \
+ T fnCPP(const array &in, const double nanval) \
+ { \
+ double rval, ival; \
+ AF_THROW(af_##fnC##_all(&rval, &ival, in.get(), nanval)); \
+ T out((Tr)rval, (Tr)ival); \
+ return out; \
+ } \
+
+INSTANTIATE(sum_nan, sum)
+INSTANTIATE(product_nan, product)
+
+#undef INSTANTIATE_REAL
+#undef INSTANTIATE_CPLX
+#undef INSTANTIATE
+
+#define INSTANTIATE_COMPAT(fnCPP, fnCompat, T) \
+ template<> AFAPI \
+ T fnCompat(const array &in) \
+ { \
+ return fnCPP<T>(in); \
}
#define INSTANTIATE(fnCPP, fnCompat) \
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index 428e5d9..5724508 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -25,16 +25,16 @@ namespace cpu
{
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
const Ti *in , const dim4 &istrides, const dim4 &idims,
- const int dim)
+ const int dim, bool change_nan, double nanval)
{
static const int D1 = D - 1;
static reduce_dim<op, Ti, To, D1> reduce_dim_next;
for (dim_t i = 0; i < odims[D1]; i++) {
- reduce_dim_next(out + i * ostrides[D1],
- ostrides, odims,
- in + i * istrides[D1],
- istrides, idims,
- dim);
+ reduce_dim_next(out + i * ostrides[D1],
+ ostrides, odims,
+ in + i * istrides[D1],
+ istrides, idims,
+ dim, change_nan, nanval);
}
}
};
@@ -47,13 +47,14 @@ namespace cpu
Binary<To, op> reduce;
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
const Ti *in , const dim4 &istrides, const dim4 &idims,
- const int dim)
+ const int dim, bool change_nan, double nanval)
{
dim_t stride = istrides[dim];
To out_val = reduce.init();
for (dim_t i = 0; i < idims[dim]; i++) {
To in_val = transform(in[i * stride]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
out_val = reduce(in_val, out_val);
}
@@ -64,10 +65,10 @@ namespace cpu
template<af_op_t op, typename Ti, typename To>
using reduce_dim_func = std::function<void(To*,const dim4&, const dim4&,
const Ti*, const dim4&, const dim4&,
- const int)>;
+ const int, bool, double)>;
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim)
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
{
dim4 odims = in.dims();
odims[dim] = 1;
@@ -79,13 +80,14 @@ namespace cpu
, reduce_dim<op, Ti, To, 4>()};
reduce_funcs[in.ndims() - 1](out.get(), out.strides(), out.dims(),
- in.get(), in.strides(), in.dims(), dim);
+ in.get(), in.strides(), in.dims(), dim,
+ change_nan, nanval);
return out;
}
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in)
+ To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
{
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
@@ -109,8 +111,9 @@ namespace cpu
for(dim_t i = 0; i < dims[0]; i++) {
dim_t idx = i + off1 + off2 + off3;
- To val = transform(inPtr[idx]);
- out = reduce(val, out);
+ To in_val = transform(inPtr[idx]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+ out = reduce(in_val, out);
}
}
}
@@ -120,8 +123,10 @@ namespace cpu
}
#define INSTANTIATE(ROp, Ti, To) \
- template Array<To> reduce<ROp, Ti, To>(const Array<Ti> &in, const int dim); \
- template To reduce_all<ROp, Ti, To>(const Array<Ti> &in);
+ template Array<To> reduce<ROp, Ti, To>(const Array<Ti> &in, const int dim, \
+ bool change_nan, double nanval); \
+ template To reduce_all<ROp, Ti, To>(const Array<Ti> &in, \
+ bool change_nan, double nanval);
//min
INSTANTIATE(af_min_t, float , float )
diff --git a/src/backend/cpu/reduce.hpp b/src/backend/cpu/reduce.hpp
index 039a47d..4e139f0 100644
--- a/src/backend/cpu/reduce.hpp
+++ b/src/backend/cpu/reduce.hpp
@@ -6,7 +6,7 @@
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-
+#pragma once
#include <af/array.h>
#include <Array.hpp>
#include <ops.hpp>
@@ -14,8 +14,8 @@
namespace cpu
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim);
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan=false, double nanval=0);
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in);
+ To reduce_all(const Array<Ti> &in, bool change_nan=false, double nanval=0);
}
diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp
index a5961f3..0263142 100644
--- a/src/backend/cuda/kernel/reduce.hpp
+++ b/src/backend/cuda/kernel/reduce.hpp
@@ -29,7 +29,8 @@ namespace kernel
__global__
static void reduce_dim_kernel(Param<To> out,
CParam <Ti> in,
- uint blocks_x, uint blocks_y, uint offset_dim)
+ uint blocks_x, uint blocks_y, uint offset_dim,
+ bool change_nan, To nanval)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
@@ -73,6 +74,7 @@ namespace kernel
To out_val = reduce.init();
for (int id = id_dim_in; is_valid && (id < in.dims[dim]); id += offset_dim * blockDim.y) {
To in_val = transform(*iptr);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = reduce(in_val, out_val);
iptr = iptr + offset_dim * blockDim.y * istride_dim;
}
@@ -106,7 +108,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op, int dim>
void reduce_dim_launcher(Param<To> out, CParam<Ti> in,
- const uint threads_y, const uint blocks_dim[4])
+ const uint threads_y, const uint blocks_dim[4],
+ bool change_nan, double nanval)
{
dim3 threads(THREADS_X, threads_y);
@@ -116,23 +119,27 @@ namespace kernel
switch (threads_y) {
case 8:
(reduce_dim_kernel<Ti, To, op, dim, 8>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
case 4:
(reduce_dim_kernel<Ti, To, op, dim, 4>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
case 2:
(reduce_dim_kernel<Ti, To, op, dim, 2>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
case 1:
(reduce_dim_kernel<Ti, To, op, dim, 1>)<<<blocks, threads>>>(
- out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim]); break;
+ out, in, blocks_dim[0], blocks_dim[1], blocks_dim[dim],
+ change_nan, scalar<To>(nanval)); break;
}
POST_LAUNCH_CHECK();
}
template<typename Ti, typename To, af_op_t op, int dim>
- void reduce_dim(Param<To> out, CParam<Ti> in)
+ void reduce_dim(Param<To> out, CParam<Ti> in, bool change_nan, double nanval)
{
uint threads_y = std::min(THREADS_Y, nextpow2(in.dims[dim]));
uint threads_x = THREADS_X;
@@ -154,15 +161,17 @@ namespace kernel
for (int k = dim + 1; k < 4; k++) tmp.strides[k] *= blocks_dim[dim];
}
- reduce_dim_launcher<Ti, To, op, dim>(tmp, in, threads_y, blocks_dim);
+ reduce_dim_launcher<Ti, To, op, dim>(tmp, in, threads_y, blocks_dim, change_nan, nanval);
if (blocks_dim[dim] > 1) {
blocks_dim[dim] = 1;
if (op == af_notzero_t) {
- reduce_dim_launcher<To, To, af_add_t, dim>(out, tmp, threads_y, blocks_dim);
+ reduce_dim_launcher<To, To, af_add_t, dim>(out, tmp, threads_y, blocks_dim,
+ change_nan, nanval);
} else {
- reduce_dim_launcher<To, To, op, dim>(out, tmp, threads_y, blocks_dim);
+ reduce_dim_launcher<To, To, op, dim>(out, tmp, threads_y, blocks_dim,
+ change_nan, nanval);
}
memFree(tmp.ptr);
@@ -171,20 +180,6 @@ namespace kernel
}
template<typename To, af_op_t op>
- __device__ void warp_reduce_sync(To *s_ptr, uint tidx)
- {
-
- }
-
-#if (__CUDA_ARCH__ >= 300)
- template<typename To, af_op_t op>
- __device__ void warp_reduce_shfl(To *s_ptr, uint tidx)
- {
-
- }
-#endif
-
- template<typename To, af_op_t op>
struct WarpReduce
{
__device__ To operator()(To *s_ptr, uint tidx)
@@ -230,7 +225,8 @@ namespace kernel
__global__
static void reduce_first_kernel(Param<To> out,
CParam<Ti> in,
- uint blocks_x, uint blocks_y, uint repeat)
+ uint blocks_x, uint blocks_y, uint repeat,
+ bool change_nan, To nanval)
{
const uint tidx = threadIdx.x;
const uint tidy = threadIdx.y;
@@ -263,6 +259,7 @@ namespace kernel
for (int id = xid; id < lim; id += DIMX) {
To in_val = transform(iptr[id]);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = reduce(in_val, out_val);
}
@@ -294,7 +291,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op>
void reduce_first_launcher(Param<To> out, CParam<Ti> in,
- const uint blocks_x, const uint blocks_y, const uint threads_x)
+ const uint blocks_x, const uint blocks_y, const uint threads_x,
+ bool change_nan, double nanval)
{
dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
@@ -306,23 +304,23 @@ namespace kernel
switch (threads_x) {
case 32:
(reduce_first_kernel<Ti, To, op, 32>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 64:
(reduce_first_kernel<Ti, To, op, 64>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 128:
(reduce_first_kernel<Ti, To, op, 128>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
case 256:
(reduce_first_kernel<Ti, To, op, 256>)<<<blocks, threads>>>(
- out, in, blocks_x, blocks_y, repeat); break;
+ out, in, blocks_x, blocks_y, repeat, change_nan, scalar<To>(nanval)); break;
}
POST_LAUNCH_CHECK();
}
template<typename Ti, typename To, af_op_t op>
- void reduce_first(Param<To> out, CParam<Ti> in)
+ void reduce_first(Param<To> out, CParam<Ti> in, bool change_nan, double nanval)
{
uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
threads_x = std::min(threads_x, THREADS_PER_BLOCK);
@@ -342,15 +340,17 @@ namespace kernel
for (int k = 1; k < 4; k++) tmp.strides[k] *= blocks_x;
}
- reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x);
+ reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x, change_nan, nanval);
if (blocks_x > 1) {
//FIXME: Is there an alternative to the if condition ?
if (op == af_notzero_t) {
- reduce_first_launcher<To, To, af_add_t>(out, tmp, 1, blocks_y, threads_x);
+ reduce_first_launcher<To, To, af_add_t>(out, tmp, 1, blocks_y, threads_x,
+ change_nan, nanval);
} else {
- reduce_first_launcher<To, To, op>(out, tmp, 1, blocks_y, threads_x);
+ reduce_first_launcher<To, To, op>(out, tmp, 1, blocks_y, threads_x,
+ change_nan, nanval);
}
memFree(tmp.ptr);
@@ -358,18 +358,18 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- void reduce(Param<To> out, CParam<Ti> in, int dim)
+ void reduce(Param<To> out, CParam<Ti> in, int dim, bool change_nan, double nanval)
{
switch (dim) {
- case 0: return reduce_first<Ti, To, op >(out, in);
- case 1: return reduce_dim <Ti, To, op, 1>(out, in);
- case 2: return reduce_dim <Ti, To, op, 2>(out, in);
- case 3: return reduce_dim <Ti, To, op, 3>(out, in);
+ case 0: return reduce_first<Ti, To, op >(out, in, change_nan, nanval);
+ case 1: return reduce_dim <Ti, To, op, 1>(out, in, change_nan, nanval);
+ case 2: return reduce_dim <Ti, To, op, 2>(out, in, change_nan, nanval);
+ case 3: return reduce_dim <Ti, To, op, 3>(out, in, change_nan, nanval);
}
}
template<typename Ti, typename To, af_op_t op>
- To reduce_all(CParam<Ti> in)
+ To reduce_all(CParam<Ti> in, bool change_nan, double nanval)
{
int in_elements = in.strides[3] * in.dims[3];
@@ -409,7 +409,8 @@ namespace kernel
int tmp_elements = tmp.strides[3] * tmp.dims[3];
tmp.ptr = memAlloc<To>(tmp_elements);
- reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x);
+ reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x,
+ change_nan, nanval);
scoped_ptr<To> h_ptr(new To[tmp_elements]);
To* h_ptr_raw = h_ptr.get();
@@ -434,9 +435,12 @@ namespace kernel
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
To out = reduce.init();
+ To nanval_to = scalar<To>(nanval);
for (int i = 0; i < in_elements; i++) {
- out = reduce(out, transform(h_ptr_raw[i]));
+ To in_val = transform(h_ptr_raw[i]);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval_to;
+ out = reduce(out, in_val);
}
return out;
diff --git a/src/backend/cuda/reduce.hpp b/src/backend/cuda/reduce.hpp
index 2af2f5e..82755bc 100644
--- a/src/backend/cuda/reduce.hpp
+++ b/src/backend/cuda/reduce.hpp
@@ -6,7 +6,7 @@
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-
+#pragma once
#include <af/array.h>
#include <Array.hpp>
#include <ops.hpp>
@@ -14,8 +14,8 @@
namespace cuda
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim);
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan=false, double nanval=0);
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in);
+ To reduce_all(const Array<Ti> &in, bool change_nan=false, double nanval=0);
}
diff --git a/src/backend/cuda/reduce_impl.hpp b/src/backend/cuda/reduce_impl.hpp
index abc6d5c..b1899f4 100644
--- a/src/backend/cuda/reduce_impl.hpp
+++ b/src/backend/cuda/reduce_impl.hpp
@@ -23,23 +23,24 @@ using af::dim4;
namespace cuda
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim)
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
{
dim4 odims = in.dims();
odims[dim] = 1;
Array<To> out = createEmptyArray<To>(odims);
- kernel::reduce<Ti, To, op>(out, in, dim);
+ kernel::reduce<Ti, To, op>(out, in, dim, change_nan, nanval);
return out;
}
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in)
+ To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
{
- return kernel::reduce_all<Ti, To, op>(in);
+ return kernel::reduce_all<Ti, To, op>(in, change_nan, nanval);
}
}
#define INSTANTIATE(Op, Ti, To) \
- template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim); \
- template To reduce_all<Op, Ti, To>(const Array<Ti> &in);
+ template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim, \
+ bool change_nan, double nanval); \
+ template To reduce_all<Op, Ti, To>(const Array<Ti> &in, bool change_nan, double nanval);
diff --git a/src/backend/opencl/kernel/reduce.hpp b/src/backend/opencl/kernel/reduce.hpp
index 0d8be35..094b42f 100644
--- a/src/backend/opencl/kernel/reduce.hpp
+++ b/src/backend/opencl/kernel/reduce.hpp
@@ -42,7 +42,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op, int dim, int threads_y>
void reduce_dim_launcher(Param out, Param in,
- const uint groups_all[4])
+ const uint groups_all[4],
+ int change_nan, double nanval)
{
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
static std::map<int, Program*> reduceProgs;
@@ -84,34 +85,49 @@ namespace kernel
auto reduceOp = make_kernel<Buffer, KParam,
Buffer, KParam,
- uint, uint, uint>(*reduceKerns[device]);
+ uint, uint, uint,
+ int, To>(*reduceKerns[device]);
reduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
*in.data, in.info,
groups_all[0],
groups_all[1],
- groups_all[dim]);
+ groups_all[dim],
+ change_nan,
+ scalar<To>(nanval));
CL_DEBUG_FINISH(getQueue());
}
template<typename Ti, typename To, af_op_t op, int dim>
void reduce_dim_fn(Param out, Param in,
- const uint threads_y, const uint groups_all[4])
+ const uint threads_y, const uint groups_all[4],
+ int change_nan, double nanval)
{
switch(threads_y) {
- case 8: return reduce_dim_launcher<Ti, To, op, dim, 8>(out, in, groups_all);
- case 4: return reduce_dim_launcher<Ti, To, op, dim, 4>(out, in, groups_all);
- case 2: return reduce_dim_launcher<Ti, To, op, dim, 2>(out, in, groups_all);
- case 1: return reduce_dim_launcher<Ti, To, op, dim, 1>(out, in, groups_all);
- case 16: return reduce_dim_launcher<Ti, To, op, dim, 16>(out, in, groups_all);
- case 32: return reduce_dim_launcher<Ti, To, op, dim, 32>(out, in, groups_all);
+ case 8: return reduce_dim_launcher<Ti, To, op, dim, 8>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 4: return reduce_dim_launcher<Ti, To, op, dim, 4>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 2: return reduce_dim_launcher<Ti, To, op, dim, 2>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 1: return reduce_dim_launcher<Ti, To, op, dim, 1>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 16: return reduce_dim_launcher<Ti, To, op, dim, 16>(out, in, groups_all,
+ change_nan, nanval);
+
+ case 32: return reduce_dim_launcher<Ti, To, op, dim, 32>(out, in, groups_all,
+ change_nan, nanval);
}
}
template<typename Ti, typename To, af_op_t op, int dim>
- void reduce_dim(Param out, Param in)
+ void reduce_dim(Param out, Param in, int change_nan, double nanval)
{
uint threads_y = std::min(THREADS_Y, nextpow2(in.info.dims[dim]));
uint threads_x = THREADS_X;
@@ -136,15 +152,17 @@ namespace kernel
for (int k = dim + 1; k < 4; k++) tmp.info.strides[k] *= groups_all[dim];
}
- reduce_dim_fn<Ti, To, op, dim>(tmp, in, threads_y, groups_all);
+ reduce_dim_fn<Ti, To, op, dim>(tmp, in, threads_y, groups_all, change_nan, nanval);
if (groups_all[dim] > 1) {
groups_all[dim] = 1;
if (op == af_notzero_t) {
- reduce_dim_fn<To, To, af_add_t, dim>(out, tmp, threads_y, groups_all);
+ reduce_dim_fn<To, To, af_add_t, dim>(out, tmp, threads_y, groups_all,
+ change_nan, nanval);
} else {
- reduce_dim_fn<To, To, op, dim>(out, tmp, threads_y, groups_all);
+ reduce_dim_fn<To, To, op, dim>(out, tmp, threads_y, groups_all,
+ change_nan, nanval);
}
bufferFree(tmp.data);
}
@@ -154,7 +172,8 @@ namespace kernel
template<typename Ti, typename To, af_op_t op, int threads_x>
void reduce_first_launcher(Param out, Param in,
const uint groups_x,
- const uint groups_y)
+ const uint groups_y,
+ int change_nan, double nanval)
{
static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
static std::map<int, Program*> reduceProgs;
@@ -197,11 +216,12 @@ namespace kernel
auto reduceOp = make_kernel<Buffer, KParam,
Buffer, KParam,
- uint, uint, uint>(*reduceKerns[device]);
+ uint, uint, uint,
+ int, To>(*reduceKerns[device]);
reduceOp(EnqueueArgs(getQueue(), global, local),
*out.data, out.info,
- *in.data, in.info, groups_x, groups_y, repeat);
+ *in.data, in.info, groups_x, groups_y, repeat, change_nan, scalar<To>(nanval));
CL_DEBUG_FINISH(getQueue());
}
@@ -210,24 +230,25 @@ namespace kernel
void reduce_first_fn(Param out, Param in,
const uint groups_x,
const uint groups_y,
- const uint threads_x)
+ const uint threads_x,
+ int change_nan, double nanval)
{
switch(threads_x) {
case 32: return reduce_first_launcher<Ti, To, op, 32>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 64: return reduce_first_launcher<Ti, To, op, 64>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 128: return reduce_first_launcher<Ti, To, op, 128>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 256: return reduce_first_launcher<Ti, To, op, 256>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
case 512: return reduce_first_launcher<Ti, To, op, 512>(out, in, groups_x,
- groups_y);
+ groups_y, change_nan, nanval);
}
}
template<typename Ti, typename To, af_op_t op>
- void reduce_first(Param out, Param in)
+ void reduce_first(Param out, Param in, int change_nan, double nanval)
{
uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
threads_x = std::min(threads_x, THREADS_PER_GROUP);
@@ -249,15 +270,15 @@ namespace kernel
for (int k = 1; k < 4; k++) tmp.info.strides[k] *= groups_x;
}
- reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x);
+ reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
if (groups_x > 1) {
//FIXME: Is there an alternative to the if condition ?
if (op == af_notzero_t) {
- reduce_first_fn<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x);
+ reduce_first_fn<To, To, af_add_t>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
} else {
- reduce_first_fn<To, To, op>(out, tmp, 1, groups_y, threads_x);
+ reduce_first_fn<To, To, op>(out, tmp, 1, groups_y, threads_x, change_nan, nanval);
}
bufferFree(tmp.data);
@@ -265,14 +286,14 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- void reduce(Param out, Param in, int dim)
+ void reduce(Param out, Param in, int dim, int change_nan, double nanval)
{
try {
switch (dim) {
- case 0: return reduce_first<Ti, To, op >(out, in);
- case 1: return reduce_dim <Ti, To, op, 1>(out, in);
- case 2: return reduce_dim <Ti, To, op, 2>(out, in);
- case 3: return reduce_dim <Ti, To, op, 3>(out, in);
+ case 0: return reduce_first<Ti, To, op >(out, in, change_nan, nanval);
+ case 1: return reduce_dim <Ti, To, op, 1>(out, in, change_nan, nanval);
+ case 2: return reduce_dim <Ti, To, op, 2>(out, in, change_nan, nanval);
+ case 3: return reduce_dim <Ti, To, op, 3>(out, in, change_nan, nanval);
}
} catch(cl::Error ex) {
CL_TO_AF_ERROR(ex);
@@ -280,7 +301,7 @@ namespace kernel
}
template<typename Ti, typename To, af_op_t op>
- To reduce_all(Param in)
+ To reduce_all(Param in, int change_nan, double nanval)
{
try {
int in_elements = in.info.dims[3] * in.info.strides[3];
@@ -321,7 +342,7 @@ namespace kernel
int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];
tmp.data = bufferAlloc(tmp_elements * sizeof(To));
- reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x);
+ reduce_first_fn<Ti, To, op>(tmp, in, groups_x, groups_y, threads_x, change_nan, nanval);
unique_ptr<To> h_ptr(new To[tmp_elements]);
getQueue().enqueueReadBuffer(*tmp.data, CL_TRUE, 0, sizeof(To) * tmp_elements, h_ptr.get());
@@ -343,9 +364,12 @@ namespace kernel
Transform<Ti, To, op> transform;
Binary<To, op> reduce;
To out = reduce.init();
+ To nanval_to = scalar<To>(nanval);
for (int i = 0; i < (int)in_elements; i++) {
- out = reduce(out, transform(h_ptr.get()[i]));
+ To in_val = transform(h_ptr.get()[i]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval_to : in_val;
+ out = reduce(out, in_val);
}
return out;
diff --git a/src/backend/opencl/kernel/reduce_dim.cl b/src/backend/opencl/kernel/reduce_dim.cl
index 5ac9f22..012661c 100644
--- a/src/backend/opencl/kernel/reduce_dim.cl
+++ b/src/backend/opencl/kernel/reduce_dim.cl
@@ -12,7 +12,8 @@ void reduce_dim_kernel(__global To *oData,
KParam oInfo,
const __global Ti *iData,
KParam iInfo,
- uint groups_x, uint groups_y, uint group_dim)
+ uint groups_x, uint groups_y, uint group_dim,
+ int change_nan, To nanval)
{
const uint lidx = get_local_id(0);
const uint lidy = get_local_id(1);
@@ -54,6 +55,7 @@ void reduce_dim_kernel(__global To *oData,
id += group_dim * get_local_size(1)) {
To in_val = transform(*iData);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = binOp(in_val, out_val);
iData = iData + group_dim * get_local_size(1) * istride_dim;
}
diff --git a/src/backend/opencl/kernel/reduce_first.cl b/src/backend/opencl/kernel/reduce_first.cl
index 8349048..16dcf9d 100644
--- a/src/backend/opencl/kernel/reduce_first.cl
+++ b/src/backend/opencl/kernel/reduce_first.cl
@@ -12,7 +12,8 @@ void reduce_first_kernel(__global To *oData,
KParam oInfo,
const __global Ti *iData,
KParam iInfo,
- uint groups_x, uint groups_y, uint repeat)
+ uint groups_x, uint groups_y, uint repeat,
+ int change_nan, To nanval)
{
const uint lidx = get_local_id(0);
const uint lidy = get_local_id(1);
@@ -40,6 +41,7 @@ void reduce_first_kernel(__global To *oData,
for (int id = xid; cond && id < lim; id += DIMX) {
To in_val = transform(iData[id]);
+ if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
out_val = binOp(in_val, out_val);
}
diff --git a/src/backend/opencl/reduce.hpp b/src/backend/opencl/reduce.hpp
index 6caa724..0ddc765 100644
--- a/src/backend/opencl/reduce.hpp
+++ b/src/backend/opencl/reduce.hpp
@@ -7,6 +7,7 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
#include <af/array.h>
#include <Array.hpp>
#include <ops.hpp>
@@ -14,8 +15,8 @@
namespace opencl
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim);
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan=false, double nanval=0);
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in);
+ To reduce_all(const Array<Ti> &in, bool change_nan=false, double nanval=0);
}
diff --git a/src/backend/opencl/reduce_impl.hpp b/src/backend/opencl/reduce_impl.hpp
index a031039..a6e8efb 100644
--- a/src/backend/opencl/reduce_impl.hpp
+++ b/src/backend/opencl/reduce_impl.hpp
@@ -21,22 +21,23 @@ using af::dim4;
namespace opencl
{
template<af_op_t op, typename Ti, typename To>
- Array<To> reduce(const Array<Ti> &in, const int dim)
+ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nanval)
{
dim4 odims = in.dims();
odims[dim] = 1;
Array<To> out = createEmptyArray<To>(odims);
- kernel::reduce<Ti, To, op>(out, in, dim);
+ kernel::reduce<Ti, To, op>(out, in, dim, change_nan, nanval);
return out;
}
template<af_op_t op, typename Ti, typename To>
- To reduce_all(const Array<Ti> &in)
+ To reduce_all(const Array<Ti> &in, bool change_nan, double nanval)
{
- return kernel::reduce_all<Ti, To, op>(in);
+ return kernel::reduce_all<Ti, To, op>(in, change_nan, nanval);
}
}
#define INSTANTIATE(Op, Ti, To) \
- template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim); \
- template To reduce_all<Op, Ti, To>(const Array<Ti> &in);
+ template Array<To> reduce<Op, Ti, To>(const Array<Ti> &in, const int dim, \
+ bool change_nan, double nanval); \
+ template To reduce_all<Op, Ti, To>(const Array<Ti> &in, bool change_nan, double nanval);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list