[arrayfire] 98/284: moved the left over fns to cpu kernel namespace
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Sun Feb 7 18:59:23 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.
commit 7d7f32ffd165f952e85cfe8d711ba147afbbe65d
Author: pradeep <pradeep at arrayfire.com>
Date: Sat Dec 19 16:04:37 2015 -0500
moved the left over fns to cpu kernel namespace
---
.../nearest_neighbour.hpp} | 71 +--
src/backend/cpu/{orb.cpp => kernel/orb.hpp} | 308 +-----------
src/backend/cpu/{random.cpp => kernel/random.hpp} | 98 +---
src/backend/cpu/kernel/range.hpp | 52 +++
src/backend/cpu/kernel/reduce.hpp | 71 +++
src/backend/cpu/kernel/regions.hpp | 194 ++++++++
.../cpu/{reorder.cpp => kernel/reorder.hpp} | 44 +-
src/backend/cpu/{resize.cpp => kernel/resize.hpp} | 55 +--
src/backend/cpu/{rotate.cpp => kernel/rotate.hpp} | 56 +--
src/backend/cpu/{scan.cpp => kernel/scan.hpp} | 61 +--
src/backend/cpu/kernel/select.hpp | 124 +++++
src/backend/cpu/kernel/shift.hpp | 69 +++
src/backend/cpu/{ => kernel}/sift_nonfree.hpp | 0
src/backend/cpu/{sobel.cpp => kernel/sobel.hpp} | 47 +-
src/backend/cpu/kernel/sort.hpp | 51 ++
.../{sort_by_key.cpp => kernel/sort_by_key.hpp} | 80 +---
.../cpu/{sort_index.cpp => kernel/sort_index.hpp} | 59 +--
src/backend/cpu/{susan.cpp => kernel/susan.hpp} | 77 +--
src/backend/cpu/kernel/tile.hpp | 55 +++
.../cpu/{transform.cpp => kernel/transform.hpp} | 60 +--
.../cpu/{transpose.cpp => kernel/transpose.hpp} | 70 +--
src/backend/cpu/kernel/triangle.hpp | 61 +++
src/backend/cpu/{unwrap.cpp => kernel/unwrap.hpp} | 57 +--
src/backend/cpu/{wrap.cpp => kernel/wrap.hpp} | 55 +--
src/backend/cpu/nearest_neighbour.cpp | 131 +-----
src/backend/cpu/orb.cpp | 520 +--------------------
src/backend/cpu/random.cpp | 176 ++-----
src/backend/cpu/range.cpp | 46 +-
src/backend/cpu/reduce.cpp | 60 +--
src/backend/cpu/regions.cpp | 175 +------
src/backend/cpu/reorder.cpp | 39 +-
src/backend/cpu/resize.cpp | 166 +------
src/backend/cpu/rotate.cpp | 71 +--
src/backend/cpu/scan.cpp | 61 +--
src/backend/cpu/select.cpp | 103 +---
src/backend/cpu/shift.cpp | 52 +--
src/backend/cpu/sift.cpp | 2 +-
src/backend/cpu/sobel.cpp | 71 +--
src/backend/cpu/sort.cpp | 44 +-
src/backend/cpu/sort_by_key.cpp | 83 +---
src/backend/cpu/sort_index.cpp | 61 +--
src/backend/cpu/susan.cpp | 84 +---
src/backend/cpu/tile.cpp | 38 +-
src/backend/cpu/transform.cpp | 93 +---
src/backend/cpu/transform_interp.hpp | 2 +
src/backend/cpu/transpose.cpp | 115 +----
src/backend/cpu/triangle.cpp | 42 +-
src/backend/cpu/unwrap.cpp | 67 +--
src/backend/cpu/wrap.cpp | 66 +--
49 files changed, 883 insertions(+), 3360 deletions(-)
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/kernel/nearest_neighbour.hpp
similarity index 56%
copy from src/backend/cpu/nearest_neighbour.cpp
copy to src/backend/cpu/kernel/nearest_neighbour.hpp
index b6f50c2..4916463 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/kernel/nearest_neighbour.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,19 +7,14 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
+#pragma once
#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
namespace cpu
{
+namespace kernel
+{
#if defined(_WIN32) || defined(_MSC_VER)
@@ -92,9 +87,9 @@ struct dist_op<ushort, To, AF_SHD>
};
template<typename T, typename To, af_match_type dist_type>
-void nearest_neighbour_(Array<uint> idx, Array<To> dist,
- const Array<T> query, const Array<T> train,
- const uint dist_dim, const uint n_dist)
+void nearest_neighbour(Array<uint> idx, Array<To> dist,
+ const Array<T> query, const Array<T> train,
+ const uint dist_dim, const uint n_dist)
{
uint sample_dim = (dist_dim == 0) ? 1 : 0;
const dim4 qDims = query.dims();
@@ -144,57 +139,5 @@ void nearest_neighbour_(Array<uint> idx, Array<To> dist,
}
}
-template<typename T, typename To>
-void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
- const Array<T>& query, const Array<T>& train,
- const uint dist_dim, const uint n_dist,
- const af_match_type dist_type)
-{
- if (n_dist > 1) {
- CPU_NOT_SUPPORTED();
- }
-
- query.eval();
- train.eval();
-
- uint sample_dim = (dist_dim == 0) ? 1 : 0;
- const dim4 qDims = query.dims();
- const dim4 outDims(n_dist, qDims[sample_dim]);
-
- idx = createEmptyArray<uint>(outDims);
- dist = createEmptyArray<To >(outDims);
-
- switch(dist_type) {
- case AF_SAD:
- getQueue().enqueue(nearest_neighbour_<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
- break;
- case AF_SSD:
- getQueue().enqueue(nearest_neighbour_<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
- break;
- case AF_SHD:
- getQueue().enqueue(nearest_neighbour_<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
- break;
- default:
- AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
- }
}
-
-#define INSTANTIATE(T, To) \
- template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist, \
- const Array<T>& query, const Array<T>& train, \
- const uint dist_dim, const uint n_dist, \
- const af_match_type dist_type);
-
-INSTANTIATE(float , float)
-INSTANTIATE(double, double)
-INSTANTIATE(int , int)
-INSTANTIATE(uint , uint)
-INSTANTIATE(intl , intl)
-INSTANTIATE(uintl , uintl)
-INSTANTIATE(uchar , uint)
-INSTANTIATE(ushort, uint)
-INSTANTIATE(short , int)
-
-INSTANTIATE(uintl , uint) // For Hamming
-
}
diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/kernel/orb.hpp
similarity index 52%
copy from src/backend/cpu/orb.cpp
copy to src/backend/cpu/kernel/orb.hpp
index 4b6629c..acd508c 100644
--- a/src/backend/cpu/orb.cpp
+++ b/src/backend/cpu/kernel/orb.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,27 +7,15 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
+#pragma once
#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-#include <resize.hpp>
-#include <fast.hpp>
-#include <sort_index.hpp>
-#include <convolve.hpp>
-#include <memory.hpp>
-#include <cstring>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
+#include <utility.hpp>
namespace cpu
{
-
-static const float PI_VAL = 3.14159265358979323846f;
+namespace kernel
+{
// Reference pattern, generated for a patch size of 31x31, as suggested by
// original ORB paper
@@ -299,24 +287,6 @@ const int ref_pat[REF_PAT_LENGTH] = {
};
template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
- if(!(sigma>0)) sigma = 0.25*dim;
-
- T sum = (T)0;
- for(int i=0;i<dim;i++)
- {
- int x = i-(dim-1)/2;
- T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
- out[i] = el;
- sum += el;
- }
-
- for(int k=0;k<dim;k++)
- out[k] /= sum;
-}
-
-template<typename T>
void keep_features(
float* x_out,
float* y_out,
@@ -535,273 +505,5 @@ void extract_orb(
-template<typename T, typename convAccT>
-unsigned orb(Array<float> &x, Array<float> &y,
- Array<float> &score, Array<float> &ori,
- Array<float> &size, Array<uint> &desc,
- const Array<T>& image,
- const float fast_thr, const unsigned max_feat,
- const float scl_fctr, const unsigned levels,
- const bool blur_img)
-{
- image.eval();
- getQueue().sync();
-
- unsigned patch_size = REF_PAT_SIZE;
-
- const af::dim4 idims = image.dims();
- unsigned min_side = std::min(idims[0], idims[1]);
- unsigned max_levels = 0;
- float scl_sum = 0.f;
-
- for (unsigned i = 0; i < levels; i++) {
- min_side /= scl_fctr;
-
- // Minimum image side for a descriptor to be computed
- if (min_side < patch_size || max_levels == levels) break;
-
- max_levels++;
- scl_sum += 1.f / (float)std::pow(scl_fctr,(float)i);
- }
-
- std::vector<float*> h_x_pyr(max_levels);
- std::vector<float*> h_y_pyr(max_levels);
- std::vector<float*> h_score_pyr(max_levels);
- std::vector<float*> h_ori_pyr(max_levels);
- std::vector<float*> h_size_pyr(max_levels);
- std::vector<unsigned*> h_desc_pyr(max_levels);
-
- std::vector<unsigned> feat_pyr(max_levels);
- unsigned total_feat = 0;
-
- // Compute number of features to keep for each level
- std::vector<unsigned> lvl_best(max_levels);
- unsigned feat_sum = 0;
- for (unsigned i = 0; i < max_levels-1; i++) {
- float lvl_scl = (float)std::pow(scl_fctr,(float)i);
- lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl);
- feat_sum += lvl_best[i];
- }
- lvl_best[max_levels-1] = max_feat - feat_sum;
-
- // Maintain a reference to previous level image
- Array<T> prev_img = createEmptyArray<T>(af::dim4());
- af::dim4 prev_ldims;
-
- af::dim4 gauss_dims(9);
- T* h_gauss = nullptr;
- Array<T> gauss_filter = createEmptyArray<T>(af::dim4());
-
- for (unsigned i = 0; i < max_levels; i++) {
- af::dim4 ldims;
- const float lvl_scl = (float)std::pow(scl_fctr,(float)i);
- Array<T> lvl_img = createEmptyArray<T>(af::dim4());
-
- if (i == 0) {
- // First level is used in its original size
- lvl_img = image;
- ldims = image.dims();
-
- prev_img = image;
- prev_ldims = image.dims();
- }
- else {
- // Resize previous level image to current level dimensions
- ldims[0] = round(idims[0] / lvl_scl);
- ldims[1] = round(idims[1] / lvl_scl);
-
- lvl_img = resize<T>(prev_img, ldims[0], ldims[1], AF_INTERP_BILINEAR);
- lvl_img.eval();
- getQueue().sync();
-
- prev_img = lvl_img;
- prev_ldims = lvl_img.dims();
- }
-
-
- Array<float> x_feat = createEmptyArray<float>(dim4());
- Array<float> y_feat = createEmptyArray<float>(dim4());
- Array<float> score_feat = createEmptyArray<float>(dim4());
-
- // Round feature size to nearest odd integer
- float size = 2.f * floor(patch_size / 2.f) + 1.f;
-
- // Avoid keeping features that might be too wide and might not fit on
- // the image, sqrt(2.f) is the radius when angle is 45 degrees and
- // represents widest case possible
- unsigned edge = ceil(size * sqrt(2.f) / 2.f);
-
- unsigned lvl_feat = fast(x_feat, y_feat, score_feat,
- lvl_img, fast_thr, 9, 1, 0.15f, edge);
- x_feat.eval();
- y_feat.eval();
- score_feat.eval();
- getQueue().sync();
-
- if (lvl_feat == 0) {
- continue;
- }
-
- float* h_x_feat = x_feat.get();
- float* h_y_feat = y_feat.get();
-
- float* h_x_harris = memAlloc<float>(lvl_feat);
- float* h_y_harris = memAlloc<float>(lvl_feat);
- float* h_score_harris = memAlloc<float>(lvl_feat);
-
- // Calculate Harris responses
- // Good block_size >= 7 (must be an odd number)
- unsigned usable_feat = 0;
- harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
- h_x_feat, h_y_feat, nullptr,
- lvl_feat, &usable_feat,
- lvl_img,
- 7, 0.04f, patch_size);
-
- if (usable_feat == 0) {
- memFree(h_x_harris);
- memFree(h_y_harris);
- memFree(h_score_harris);
- continue;
- }
-
- // Sort features according to Harris responses
- af::dim4 usable_feat_dims(usable_feat);
- Array<float> score_harris = createDeviceDataArray<float>(usable_feat_dims, h_score_harris);
- Array<float> harris_sorted = createEmptyArray<float>(af::dim4());
- Array<unsigned> harris_idx = createEmptyArray<unsigned>(af::dim4());
-
- sort_index<float, false>(harris_sorted, harris_idx, score_harris, 0);
- harris_sorted.eval();
- harris_idx.eval();
- getQueue().sync();
-
- usable_feat = std::min(usable_feat, lvl_best[i]);
-
- if (usable_feat == 0) {
- memFree(h_x_harris);
- memFree(h_y_harris);
- continue;
- }
-
- float* h_x_lvl = memAlloc<float>(usable_feat);
- float* h_y_lvl = memAlloc<float>(usable_feat);
- float* h_score_lvl = memAlloc<float>(usable_feat);
-
- // Keep only features with higher Harris responses
- keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
- h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(),
- nullptr, usable_feat);
-
- memFree(h_x_harris);
- memFree(h_y_harris);
-
- float* h_ori_lvl = memAlloc<float>(usable_feat);
- float* h_size_lvl = memAlloc<float>(usable_feat);
-
- // Compute orientation of features
- centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
- lvl_img, patch_size);
-
- Array<T> lvl_filt = createEmptyArray<T>(dim4());
-
- if (blur_img) {
- // Calculate a separable Gaussian kernel, if one is not already stored
- if (!h_gauss) {
- h_gauss = memAlloc<T>(gauss_dims[0]);
- gaussian1D(h_gauss, gauss_dims[0], 2.f);
- gauss_filter = createDeviceDataArray<T>(gauss_dims, h_gauss);
- }
-
- // Filter level image with Gaussian kernel to reduce noise sensitivity
- lvl_filt = convolve2<T, convAccT, false>(lvl_img, gauss_filter, gauss_filter);
- }
- lvl_filt.eval();
- getQueue().sync();
-
- // Compute ORB descriptors
- unsigned* h_desc_lvl = memAlloc<unsigned>(usable_feat * 8);
- memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned));
- if (blur_img)
- extract_orb<T>(h_desc_lvl, usable_feat,
- h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
- lvl_filt, lvl_scl, patch_size);
- else
- extract_orb<T>(h_desc_lvl, usable_feat,
- h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
- lvl_img, lvl_scl, patch_size);
-
- // Store results to pyramids
- total_feat += usable_feat;
- feat_pyr[i] = usable_feat;
- h_x_pyr[i] = h_x_lvl;
- h_y_pyr[i] = h_y_lvl;
- h_score_pyr[i] = h_score_lvl;
- h_ori_pyr[i] = h_ori_lvl;
- h_size_pyr[i] = h_size_lvl;
- h_desc_pyr[i] = h_desc_lvl;
-
- }
-
- if (total_feat > 0 ) {
-
- // Allocate feature Arrays
- const af::dim4 total_feat_dims(total_feat);
- const af::dim4 desc_dims(8, total_feat);
-
- x = createEmptyArray<float>(total_feat_dims);
- y = createEmptyArray<float>(total_feat_dims);
- score = createEmptyArray<float>(total_feat_dims);
- ori = createEmptyArray<float>(total_feat_dims);
- size = createEmptyArray<float>(total_feat_dims);
- desc = createEmptyArray<uint >(desc_dims);
-
- float* h_x = x.get();
- float* h_y = y.get();
- float* h_score = score.get();
- float* h_ori = ori.get();
- float* h_size = size.get();
-
- unsigned* h_desc = desc.get();
-
- unsigned offset = 0;
- for (unsigned i = 0; i < max_levels; i++) {
- if (feat_pyr[i] == 0)
- continue;
-
- if (i > 0)
- offset += feat_pyr[i-1];
-
- memcpy(h_x+offset, h_x_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(h_y+offset, h_y_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(h_score+offset, h_score_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(h_ori+offset, h_ori_pyr[i], feat_pyr[i] * sizeof(float));
- memcpy(h_size+offset, h_size_pyr[i], feat_pyr[i] * sizeof(float));
-
- memcpy(h_desc+(offset*8), h_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned));
-
- memFree(h_x_pyr[i]);
- memFree(h_y_pyr[i]);
- memFree(h_score_pyr[i]);
- memFree(h_ori_pyr[i]);
- memFree(h_size_pyr[i]);
- memFree(h_desc_pyr[i]);
- }
- }
-
- return total_feat;
}
-
-#define INSTANTIATE(T, convAccT) \
- template unsigned orb<T, convAccT>(Array<float> &x, Array<float> &y, \
- Array<float> &score, Array<float> &ori, \
- Array<float> &size, Array<uint> &desc, \
- const Array<T>& image, \
- const float fast_thr, const unsigned max_feat, \
- const float scl_fctr, const unsigned levels, \
- const bool blur_img);
-
-INSTANTIATE(float , float )
-INSTANTIATE(double, double)
-
}
diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/kernel/random.hpp
similarity index 61%
copy from src/backend/cpu/random.cpp
copy to src/backend/cpu/kernel/random.hpp
index 8c83ad6..357cbd2 100644
--- a/src/backend/cpu/random.cpp
+++ b/src/backend/cpu/kernel/random.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,22 +7,20 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
#include <type_traits>
#include <random>
#include <algorithm>
#include <functional>
#include <limits>
#include <type_traits>
-#include <af/array.h>
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <Array.hpp>
-#include <random.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
namespace cpu
{
+namespace kernel
+{
using namespace std;
@@ -76,7 +74,7 @@ static bool is_first = true;
#define GLOBAL 1
template<typename T>
-void randn_(Array<T> out)
+void randn(Array<T> out)
{
static unsigned long long my_seed = 0;
if (is_first) {
@@ -98,15 +96,7 @@ void randn_(Array<T> out)
}
template<typename T>
-Array<T> randn(const af::dim4 &dims)
-{
- Array<T> outArray = createEmptyArray<T>(dims);
- getQueue().enqueue(randn_<T>, outArray);
- return outArray;
-}
-
-template<typename T>
-void randu_(Array<T> out)
+void randu(Array<T> out)
{
static unsigned long long my_seed = 0;
if (is_first) {
@@ -128,7 +118,7 @@ void randu_(Array<T> out)
}
template<>
-void randu_(Array<char> out)
+void randu(Array<char> out)
{
static unsigned long long my_seed = 0;
if (is_first) {
@@ -149,75 +139,5 @@ void randu_(Array<char> out)
}
}
-template<typename T>
-Array<T> randu(const af::dim4 &dims)
-{
- Array<T> outArray = createEmptyArray<T>(dims);
- getQueue().enqueue(randu_<T>, outArray);
- return outArray;
-}
-
-#define INSTANTIATE_UNIFORM(T) \
- template Array<T> randu<T> (const af::dim4 &dims);
-
-INSTANTIATE_UNIFORM(float)
-INSTANTIATE_UNIFORM(double)
-INSTANTIATE_UNIFORM(cfloat)
-INSTANTIATE_UNIFORM(cdouble)
-INSTANTIATE_UNIFORM(int)
-INSTANTIATE_UNIFORM(uint)
-INSTANTIATE_UNIFORM(intl)
-INSTANTIATE_UNIFORM(uintl)
-INSTANTIATE_UNIFORM(uchar)
-INSTANTIATE_UNIFORM(short)
-INSTANTIATE_UNIFORM(ushort)
-
-#define INSTANTIATE_NORMAL(T) \
- template Array<T> randn<T>(const af::dim4 &dims);
-
-INSTANTIATE_NORMAL(float)
-INSTANTIATE_NORMAL(double)
-INSTANTIATE_NORMAL(cfloat)
-INSTANTIATE_NORMAL(cdouble)
-
-template<>
-Array<char> randu(const af::dim4 &dims)
-{
- static unsigned long long my_seed = 0;
- if (is_first) {
- setSeed(gen_seed);
- my_seed = gen_seed;
- }
-
- static auto gen = urand<float>(generator);
-
- if (my_seed != gen_seed) {
- gen = urand<float>(generator);
- my_seed = gen_seed;
- }
-
- Array<char> outArray = createEmptyArray<char>(dims);
- char *outPtr = outArray.get();
- for (int i = 0; i < (int)outArray.elements(); i++) {
- outPtr[i] = gen() > 0.5;
- }
- return outArray;
-}
-
-void setSeed(const uintl seed)
-{
- auto f = [=](const uintl seed){
- generator.seed(seed);
- is_first = false;
- gen_seed = seed;
- };
- getQueue().enqueue(f, seed);
}
-
-uintl getSeed()
-{
- getQueue().sync();
- return gen_seed;
-}
-
}
diff --git a/src/backend/cpu/kernel/range.hpp b/src/backend/cpu/kernel/range.hpp
new file mode 100644
index 0000000..b244a19
--- /dev/null
+++ b/src/backend/cpu/kernel/range.hpp
@@ -0,0 +1,52 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, int dim>
+void range(Array<T> output)
+{
+ T* out = output.get();
+
+ const dim4 dims = output.dims();
+ const dim4 strides = output.strides();
+
+ for(dim_t w = 0; w < dims[3]; w++) {
+ dim_t offW = w * strides[3];
+ for(dim_t z = 0; z < dims[2]; z++) {
+ dim_t offWZ = offW + z * strides[2];
+ for(dim_t y = 0; y < dims[1]; y++) {
+ dim_t offWZY = offWZ + y * strides[1];
+ for(dim_t x = 0; x < dims[0]; x++) {
+ dim_t id = offWZY + x;
+ if(dim == 0) {
+ out[id] = x;
+ } else if(dim == 1) {
+ out[id] = y;
+ } else if(dim == 2) {
+ out[id] = z;
+ } else if(dim == 3) {
+ out[id] = w;
+ }
+ }
+ }
+ }
+ }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/reduce.hpp b/src/backend/cpu/kernel/reduce.hpp
new file mode 100644
index 0000000..85119dc
--- /dev/null
+++ b/src/backend/cpu/kernel/reduce.hpp
@@ -0,0 +1,71 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<af_op_t op, typename Ti, typename To, int D>
+struct reduce_dim
+{
+ void operator()(Array<To> out, const dim_t outOffset,
+ const Array<Ti> in, const dim_t inOffset,
+ const int dim, bool change_nan, double nanval)
+ {
+ static const int D1 = D - 1;
+ static reduce_dim<op, Ti, To, D1> reduce_dim_next;
+
+ const af::dim4 ostrides = out.strides();
+ const af::dim4 istrides = in.strides();
+ const af::dim4 odims = out.dims();
+
+ for (dim_t i = 0; i < odims[D1]; i++) {
+ reduce_dim_next(out, outOffset + i * ostrides[D1],
+ in, inOffset + i * istrides[D1],
+ dim, change_nan, nanval);
+ }
+ }
+};
+
+template<af_op_t op, typename Ti, typename To>
+struct reduce_dim<op, Ti, To, 0>
+{
+
+ Transform<Ti, To, op> transform;
+ Binary<To, op> reduce;
+ void operator()(Array<To> out, const dim_t outOffset,
+ const Array<Ti> in, const dim_t inOffset,
+ const int dim, bool change_nan, double nanval)
+ {
+ const af::dim4 istrides = in.strides();
+ const af::dim4 idims = in.dims();
+
+ To * const outPtr = out.get() + outOffset;
+ Ti const * const inPtr = in.get() + inOffset;
+ dim_t stride = istrides[dim];
+
+ To out_val = reduce.init();
+ for (dim_t i = 0; i < idims[dim]; i++) {
+ To in_val = transform(inPtr[i * stride]);
+ if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+ out_val = reduce(in_val, out_val);
+ }
+
+ *outPtr = out_val;
+ }
+};
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/regions.hpp b/src/backend/cpu/kernel/regions.hpp
new file mode 100644
index 0000000..863ebc5
--- /dev/null
+++ b/src/backend/cpu/kernel/regions.hpp
@@ -0,0 +1,194 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+class LabelNode
+{
+private:
+ T label;
+ T minLabel;
+ unsigned rank;
+ LabelNode* parent;
+
+public:
+ LabelNode() : label(0), minLabel(0), rank(0), parent(this) { }
+ LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { }
+
+ T getLabel()
+ {
+ return label;
+ }
+
+ T getMinLabel()
+ {
+ return minLabel;
+ }
+
+ LabelNode* getParent()
+ {
+ return parent;
+ }
+
+ unsigned getRank()
+ {
+ return rank;
+ }
+
+ void setMinLabel(T l)
+ {
+ minLabel = l;
+ }
+
+ void setParent(LabelNode* p)
+ {
+ parent = p;
+ }
+
+ void setRank(unsigned r)
+ {
+ rank = r;
+ }
+};
+
+template<typename T>
+static LabelNode<T>* find(LabelNode<T>* x)
+{
+ if (x->getParent() != x)
+ x->setParent(find(x->getParent()));
+ return x->getParent();
+}
+
+template<typename T>
+static void setUnion(LabelNode<T>* x, LabelNode<T>* y)
+{
+ LabelNode<T>* xRoot = find(x);
+ LabelNode<T>* yRoot = find(y);
+ if (xRoot == yRoot)
+ return;
+
+ T xMinLabel = xRoot->getMinLabel();
+ T yMinLabel = yRoot->getMinLabel();
+ xRoot->setMinLabel(min(xMinLabel, yMinLabel));
+ yRoot->setMinLabel(min(xMinLabel, yMinLabel));
+
+ if (xRoot->getRank() < yRoot->getRank())
+ xRoot->setParent(yRoot);
+ else if (xRoot->getRank() > yRoot->getRank())
+ yRoot->setParent(xRoot);
+ else {
+ yRoot->setParent(xRoot);
+ xRoot->setRank(xRoot->getRank() + 1);
+ }
+}
+
+template<typename T>
+void regions(Array<T> out, const Array<char> in, af_connectivity connectivity)
+{
+ const af::dim4 in_dims = in.dims();
+ const char *in_ptr = in.get();
+ T *out_ptr = out.get();
+
+ // Map labels
+ typedef typename std::map<T, LabelNode<T>* > label_map_t;
+ typedef typename label_map_t::iterator label_map_iterator_t;
+
+ label_map_t lmap;
+
+ // Initial label
+ T label = (T)1;
+
+ for (int j = 0; j < (int)in_dims[1]; j++) {
+ for (int i = 0; i < (int)in_dims[0]; i++) {
+ int idx = j * in_dims[0] + i;
+ if (in_ptr[idx] != 0) {
+ std::vector<T> l;
+
+ // Test neighbors
+ if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0)
+ l.push_back(out_ptr[j * in_dims[0] + i-1]);
+ if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0)
+ l.push_back(out_ptr[(j-1) * in_dims[0] + i]);
+ if (connectivity == AF_CONNECTIVITY_8 && i > 0 &&
+ j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0)
+ l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]);
+ if (connectivity == AF_CONNECTIVITY_8 &&
+ i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0)
+ l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]);
+
+ if (!l.empty()) {
+ T minl = l[0];
+ for (size_t k = 0; k < l.size(); k++) {
+ minl = min(l[k], minl);
+ label_map_iterator_t cur_map = lmap.find(l[k]);
+ LabelNode<T> *node = cur_map->second;
+ // Group labels of the same region under a disjoint set
+ for (size_t m = k+1; m < l.size(); m++)
+ setUnion(node, lmap.find(l[m])->second);
+ }
+ // Set label to smallest neighbor label
+ out_ptr[idx] = minl;
+ }
+ else {
+ // Insert new label in map
+ LabelNode<T> *node = new LabelNode<T>(label);
+ lmap.insert(std::pair<T, LabelNode<T>* >(label, node));
+ out_ptr[idx] = label++;
+ }
+ }
+ }
+ }
+
+ std::set<T> removed;
+
+ for (int j = 0; j < (int)in_dims[1]; j++) {
+ for (int i = 0; i < (int)in_dims[0]; i++) {
+ int idx = j * (int)in_dims[0] + i;
+ if (in_ptr[idx] != 0) {
+ T l = out_ptr[idx];
+ label_map_iterator_t cur_map = lmap.find(l);
+
+ if (cur_map != lmap.end()) {
+ LabelNode<T>* node = cur_map->second;
+
+ LabelNode<T>* node_root = find(node);
+ out_ptr[idx] = node_root->getMinLabel();
+
+ // Mark removed labels (those that are part of a region
+ // that contains a smaller label)
+ if (node->getMinLabel() < l || node_root->getMinLabel() < l)
+ removed.insert(l);
+ if (node->getLabel() > node->getMinLabel())
+ removed.insert(node->getLabel());
+ }
+ }
+ }
+ }
+
+ // Calculate final neighbors (ensure final labels are sequential)
+ for (int j = 0; j < (int)in_dims[1]; j++) {
+ for (int i = 0; i < (int)in_dims[0]; i++) {
+ int idx = j * (int)in_dims[0] + i;
+ if (out_ptr[idx] > 0) {
+ out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx]));
+ }
+ }
+ }
+}
+
+}
+}
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/kernel/reorder.hpp
similarity index 57%
copy from src/backend/cpu/reorder.cpp
copy to src/backend/cpu/kernel/reorder.hpp
index 1ad7dad..c10c96e 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/kernel/reorder.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,18 +7,17 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <reorder.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
namespace cpu
{
+namespace kernel
+{
template<typename T>
-void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
+void reorder(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
{
T* outPtr = out.get();
const T* inPtr = in.get();
@@ -51,35 +50,6 @@ void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::d
}
}
-template<typename T>
-Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
-{
- in.eval();
-
- const af::dim4 iDims = in.dims();
- af::dim4 oDims(0);
- for(int i = 0; i < 4; i++)
- oDims[i] = iDims[rdims[i]];
-
- Array<T> out = createEmptyArray<T>(oDims);
- getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
- return out;
}
-
-#define INSTANTIATE(T) \
- template Array<T> reorder<T>(const Array<T> &in, const af::dim4 &rdims); \
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
+
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/kernel/resize.hpp
similarity index 79%
copy from src/backend/cpu/resize.cpp
copy to src/backend/cpu/kernel/resize.hpp
index 8fb2edc..19d7ec7 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/kernel/resize.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,18 +7,14 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <resize.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <math.hpp>
-#include <types.hpp>
-#include <af/traits.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
namespace cpu
{
+namespace kernel
+{
/**
* noop function for round to avoid compilation
@@ -160,7 +156,7 @@ struct resize_op<T, AF_INTERP_LOWER>
};
template<typename T, af_interp_type method>
-void resize_(Array<T> out, const Array<T> in)
+void resize(Array<T> out, const Array<T> in)
{
af::dim4 idims = in.dims();
af::dim4 odims = out.dims();
@@ -177,44 +173,5 @@ void resize_(Array<T> out, const Array<T> in)
}
}
-template<typename T>
-Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
- const af_interp_type method)
-{
- af::dim4 idims = in.dims();
- af::dim4 odims(odim0, odim1, idims[2], idims[3]);
- // Create output placeholder
- Array<T> out = createValueArray(odims, (T)0);
- out.eval();
- in.eval();
-
- switch(method) {
- case AF_INTERP_NEAREST:
- getQueue().enqueue(resize_<T, AF_INTERP_NEAREST>, out, in); break;
- case AF_INTERP_BILINEAR:
- getQueue().enqueue(resize_<T, AF_INTERP_BILINEAR>, out, in); break;
- case AF_INTERP_LOWER:
- getQueue().enqueue(resize_<T, AF_INTERP_LOWER>, out, in); break;
- default: break;
- }
- return out;
}
-
-#define INSTANTIATE(T) \
- template Array<T> resize<T> (const Array<T> &in, const dim_t odim0, const dim_t odim1, \
- const af_interp_type method);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/kernel/rotate.hpp
similarity index 61%
copy from src/backend/cpu/rotate.cpp
copy to src/backend/cpu/kernel/rotate.hpp
index 5687d69..6e4f758 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/kernel/rotate.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,20 +7,19 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <rotate.hpp>
#include <math.hpp>
-#include <stdexcept>
#include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-#include "transform_interp.hpp"
namespace cpu
{
+namespace kernel
+{
template<typename T, af_interp_type method>
-void rotate_(Array<T> output, const Array<T> input, const float theta)
+void rotate(Array<T> output, const Array<T> input, const float theta)
{
const af::dim4 odims = output.dims();
const af::dim4 idims = input.dims();
@@ -80,48 +79,5 @@ void rotate_(Array<T> output, const Array<T> input, const float theta)
}
}
-template<typename T>
-Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
- const af_interp_type method)
-{
- in.eval();
-
- Array<T> out = createEmptyArray<T>(odims);
-
- switch(method) {
- case AF_INTERP_NEAREST:
- getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
- break;
- case AF_INTERP_BILINEAR:
- getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
- break;
- case AF_INTERP_LOWER:
- getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
- break;
- default:
- AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
- break;
- }
-
- return out;
}
-
-
-#define INSTANTIATE(T) \
- template Array<T> rotate(const Array<T> &in, const float theta, \
- const af::dim4 &odims, const af_interp_type method);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/kernel/scan.hpp
similarity index 53%
copy from src/backend/cpu/scan.cpp
copy to src/backend/cpu/kernel/scan.hpp
index 39157ca..0bcfe7d 100644
--- a/src/backend/cpu/scan.cpp
+++ b/src/backend/cpu/kernel/scan.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,20 +7,14 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <complex>
-#include <af/dim4.hpp>
+#pragma once
#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <scan.hpp>
-#include <ops.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
namespace cpu
{
+namespace kernel
+{
template<af_op_t op, typename Ti, typename To, int D>
struct scan_dim
@@ -74,52 +68,5 @@ struct scan_dim<op, Ti, To, 0>
}
};
-template<af_op_t op, typename Ti, typename To>
-Array<To> scan(const Array<Ti>& in, const int dim)
-{
- dim4 dims = in.dims();
- Array<To> out = createValueArray<To>(dims, 0);
- out.eval();
- in.eval();
-
- switch (in.ndims()) {
- case 1:
- scan_dim<op, Ti, To, 1> func1;
- getQueue().enqueue(func1, out, 0, in, 0, dim);
- break;
- case 2:
- scan_dim<op, Ti, To, 2> func2;
- getQueue().enqueue(func2, out, 0, in, 0, dim);
- break;
- case 3:
- scan_dim<op, Ti, To, 3> func3;
- getQueue().enqueue(func3, out, 0, in, 0, dim);
- break;
- case 4:
- scan_dim<op, Ti, To, 4> func4;
- getQueue().enqueue(func4, out, 0, in, 0, dim);
- break;
- }
-
- return out;
}
-
-#define INSTANTIATE(ROp, Ti, To) \
- template Array<To> scan<ROp, Ti, To>(const Array<Ti> &in, const int dim); \
-
-//accum
-INSTANTIATE(af_add_t, float , float )
-INSTANTIATE(af_add_t, double , double )
-INSTANTIATE(af_add_t, cfloat , cfloat )
-INSTANTIATE(af_add_t, cdouble, cdouble)
-INSTANTIATE(af_add_t, int , int )
-INSTANTIATE(af_add_t, uint , uint )
-INSTANTIATE(af_add_t, intl , intl )
-INSTANTIATE(af_add_t, uintl , uintl )
-INSTANTIATE(af_add_t, char , int )
-INSTANTIATE(af_add_t, uchar , uint )
-INSTANTIATE(af_add_t, short , int )
-INSTANTIATE(af_add_t, ushort , uint )
-INSTANTIATE(af_notzero_t, char , uint)
-
}
diff --git a/src/backend/cpu/kernel/select.hpp b/src/backend/cpu/kernel/select.hpp
new file mode 100644
index 0000000..1099c7e
--- /dev/null
+++ b/src/backend/cpu/kernel/select.hpp
@@ -0,0 +1,124 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+/**
+ * Element-wise selection: each output element is taken from `a` where the
+ * condition is non-zero and from `b` otherwise.
+ *
+ * The iteration space is the dimensions of `out`. Along any dimension where
+ * an input's extent differs from the output's, that input's index is pinned
+ * to 0, so the same slice is reused for every output position along it.
+ *
+ * All linear offsets are computed in dim_t so that stride products are not
+ * narrowed to 32 bits.
+ *
+ * @param out   destination array, written through out.get()
+ * @param cond  condition array; a non-zero element selects from `a`
+ * @param a     values used where the condition is non-zero
+ * @param b     values used where the condition is zero
+ */
+template<typename T>
+void select(Array<T> out, const Array<char> cond, const Array<T> a, const Array<T> b)
+{
+    const af::dim4 adims    = a.dims();
+    const af::dim4 astrides = a.strides();
+    const af::dim4 bdims    = b.dims();
+    const af::dim4 bstrides = b.strides();
+
+    const af::dim4 cdims    = cond.dims();
+    const af::dim4 cstrides = cond.strides();
+
+    const af::dim4 odims    = out.dims();
+    const af::dim4 ostrides = out.strides();
+
+    // is_x_same[d] is true when input x has the output's extent along d.
+    // When false, multiplying by it zeroes that dimension's offset term.
+    const bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1],
+                              adims[2] == odims[2], adims[3] == odims[3]};
+
+    const bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1],
+                              bdims[2] == odims[2], bdims[3] == odims[3]};
+
+    const bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1],
+                              cdims[2] == odims[2], cdims[3] == odims[3]};
+
+    const T    *aptr = a.get();
+    const T    *bptr = b.get();
+    T          *optr = out.get();
+    const char *cptr = cond.get();
+
+    for (dim_t l = 0; l < odims[3]; l++) {
+
+        const dim_t o_off3 = ostrides[3] * l;
+        const dim_t a_off3 = astrides[3] * is_a_same[3] * l;
+        const dim_t b_off3 = bstrides[3] * is_b_same[3] * l;
+        const dim_t c_off3 = cstrides[3] * is_c_same[3] * l;
+
+        for (dim_t k = 0; k < odims[2]; k++) {
+
+            const dim_t o_off2 = ostrides[2] * k + o_off3;
+            const dim_t a_off2 = astrides[2] * is_a_same[2] * k + a_off3;
+            const dim_t b_off2 = bstrides[2] * is_b_same[2] * k + b_off3;
+            const dim_t c_off2 = cstrides[2] * is_c_same[2] * k + c_off3;
+
+            for (dim_t j = 0; j < odims[1]; j++) {
+
+                const dim_t o_off1 = ostrides[1] * j + o_off2;
+                const dim_t a_off1 = astrides[1] * is_a_same[1] * j + a_off2;
+                const dim_t b_off1 = bstrides[1] * is_b_same[1] * j + b_off2;
+                const dim_t c_off1 = cstrides[1] * is_c_same[1] * j + c_off2;
+
+                for (dim_t i = 0; i < odims[0]; i++) {
+
+                    // Dimension 0 is addressed with an implicit stride of 1.
+                    const bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1];
+                    const T    aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1];
+                    const T    bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1];
+                    optr[o_off1 + i] = cval ? aval : bval;
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Element-wise selection between an array and a scalar.
+ *
+ *   flip == false : out = cond ? a : b
+ *   flip == true  : out = cond ? b : a
+ *
+ * `cond` and `a` are indexed with their own strides over the dimensions of
+ * `out`; no per-dimension extent check is performed here.
+ *
+ * The condition byte is reduced to a bool before it is combined with `flip`.
+ * XOR-ing the raw char would treat any non-zero value other than 1 as true
+ * for both settings of `flip`.
+ *
+ * Linear offsets are computed in dim_t so stride products are not narrowed
+ * to 32 bits.
+ *
+ * @param out   destination array, written through out.get()
+ * @param cond  condition array; non-zero means "true"
+ * @param a     array operand
+ * @param b     scalar operand
+ */
+template<typename T, bool flip>
+void select_scalar(Array<T> out, const Array<char> cond, const Array<T> a, const double b)
+{
+    const af::dim4 astrides = a.strides();
+    const af::dim4 cstrides = cond.strides();
+
+    const af::dim4 odims    = out.dims();
+    const af::dim4 ostrides = out.strides();
+
+    const T    *aptr = a.get();
+    T          *optr = out.get();
+    const char *cptr = cond.get();
+
+    for (dim_t l = 0; l < odims[3]; l++) {
+
+        const dim_t o_off3 = ostrides[3] * l;
+        const dim_t a_off3 = astrides[3] * l;
+        const dim_t c_off3 = cstrides[3] * l;
+
+        for (dim_t k = 0; k < odims[2]; k++) {
+
+            const dim_t o_off2 = ostrides[2] * k + o_off3;
+            const dim_t a_off2 = astrides[2] * k + a_off3;
+            const dim_t c_off2 = cstrides[2] * k + c_off3;
+
+            for (dim_t j = 0; j < odims[1]; j++) {
+
+                const dim_t o_off1 = ostrides[1] * j + o_off2;
+                const dim_t a_off1 = astrides[1] * j + a_off2;
+                const dim_t c_off1 = cstrides[1] * j + c_off2;
+
+                for (dim_t i = 0; i < odims[0]; i++) {
+
+                    // Pick `a` when the (boolean) condition differs from flip.
+                    const bool cval = (cptr[c_off1 + i] != 0);
+                    optr[o_off1 + i] = (cval != flip) ? aptr[a_off1 + i] : b;
+                }
+            }
+        }
+    }
+}
+
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/shift.hpp b/src/backend/cpu/kernel/shift.hpp
new file mode 100644
index 0000000..8beb975
--- /dev/null
+++ b/src/backend/cpu/kernel/shift.hpp
@@ -0,0 +1,69 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <cassert>
+
+namespace cpu
+{
+namespace kernel
+{
+
+/**
+ * Fold an index back below `dim` with at most one subtraction.
+ *
+ * Returns `i` unchanged when it is already below `dim`, otherwise `i - dim`.
+ * Because only one subtraction is made, the result lies in [0, dim) only
+ * for 0 <= i < 2 * dim.
+ */
+static inline dim_t simple_mod(const dim_t i, const dim_t dim)
+{
+    if (i >= dim) {
+        return i - dim;
+    }
+    return i;
+}
+
+/**
+ * Circularly shift `in` into `out` by sdims[d] positions along each
+ * dimension d.
+ *
+ * The kernel walks the output and looks up the matching input element, so
+ * each requested shift is first turned into a non-negative input offset.
+ *
+ * All offsets and indices are kept in dim_t so that stride products are not
+ * narrowed to 32 bits.
+ *
+ * @param out    destination array, written through out.get()
+ * @param in     source array; indexed using the dimensions of `out`
+ * @param sdims  signed shift per dimension
+ */
+template<typename T>
+void shift(Array<T> out, const Array<T> in, const af::dim4 sdims)
+{
+    T* outPtr = out.get();
+    const T* inPtr = in.get();
+
+    const af::dim4 oDims = out.dims();
+    const af::dim4 ist   = in.strides();
+    const af::dim4 ost   = out.strides();
+
+    dim_t sdims_[4];
+    // Need to do this because we are mapping output to input in the kernel
+    for(int i = 0; i < 4; i++) {
+        // A zero extent means there is nothing to copy, and it would make
+        // the modulo below undefined.
+        if (oDims[i] == 0) { return; }
+
+        // sdims_[i] is always non-negative and within [0, oDims[i]].
+        // Negative shifts are converted to positive by going the other way round
+        sdims_[i] = -(sdims[i] % oDims[i]) + oDims[i] * (sdims[i] > 0);
+        assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]);
+    }
+
+    // Each (index + offset) is below 2 * extent, so the single subtraction
+    // in simple_mod is enough to wrap it.
+    for(dim_t ow = 0; ow < oDims[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        const dim_t iw = simple_mod((ow + sdims_[3]), oDims[3]);
+        const dim_t iW = iw * ist[3];
+        for(dim_t oz = 0; oz < oDims[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            const dim_t iz  = simple_mod((oz + sdims_[2]), oDims[2]);
+            const dim_t iZW = iW + iz * ist[2];
+            for(dim_t oy = 0; oy < oDims[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                const dim_t iy   = simple_mod((oy + sdims_[1]), oDims[1]);
+                const dim_t iYZW = iZW + iy * ist[1];
+                for(dim_t ox = 0; ox < oDims[0]; ox++) {
+                    const dim_t oIdx = oYZW + ox;
+                    const dim_t ix   = simple_mod((ox + sdims_[0]), oDims[0]);
+                    const dim_t iIdx = iYZW + ix;
+
+                    outPtr[oIdx] = inPtr[iIdx];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/kernel/sift_nonfree.hpp
similarity index 100%
rename from src/backend/cpu/sift_nonfree.hpp
rename to src/backend/cpu/kernel/sift_nonfree.hpp
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/kernel/sobel.hpp
similarity index 65%
copy from src/backend/cpu/sobel.cpp
copy to src/backend/cpu/kernel/sobel.hpp
index ba47ba9..49d33cd 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/kernel/sobel.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,26 +7,21 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
+#pragma once
#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <sobel.hpp>
-#include <convolve.hpp>
-#include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
+#include <cassert>
namespace cpu
{
+namespace kernel
+{
template<typename Ti, typename To, bool isDX>
void derivative(Array<To> output, const Array<Ti> input)
{
- const dim4 dims = input.dims();
- const dim4 strides = input.strides();
+ const af::dim4 dims = input.dims();
+ const af::dim4 strides = input.strides();
To* optr = output.get();
const Ti* iptr = input.get();
@@ -87,33 +82,5 @@ void derivative(Array<To> output, const Array<Ti> input)
}
}
-template<typename Ti, typename To>
-std::pair< Array<To>, Array<To> >
-sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
-{
- img.eval();
- // ket_size is for future proofing, this argument is not used
- // currently
- Array<To> dx = createEmptyArray<To>(img.dims());
- Array<To> dy = createEmptyArray<To>(img.dims());
-
- getQueue().enqueue(derivative<Ti, To, true >, dx, img);
- getQueue().enqueue(derivative<Ti, To, false>, dy, img);
-
- return std::make_pair(dx, dy);
}
-
-#define INSTANTIATE(Ti, To) \
- template std::pair< Array<To>, Array<To> > \
- sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size);
-
-INSTANTIATE(float , float)
-INSTANTIATE(double, double)
-INSTANTIATE(int , int)
-INSTANTIATE(uint , int)
-INSTANTIATE(char , int)
-INSTANTIATE(uchar , int)
-INSTANTIATE(short , int)
-INSTANTIATE(ushort, int)
-
}
diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp
new file mode 100644
index 0000000..cba07fa
--- /dev/null
+++ b/src/backend/cpu/kernel/sort.hpp
@@ -0,0 +1,51 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+#include <algorithm>
+#include <numeric>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Based off of http://stackoverflow.com/a/12399290
+/**
+ * Sort the buffer returned by val.get() along dimension 0.
+ *
+ * Every run of val.dims()[0] consecutive elements, located through the
+ * strides of dimensions 1..3, is sorted independently with std::sort:
+ * ascending when isAscending is true, descending otherwise.
+ *
+ * The comparator is chosen from the template parameter, so std::sort is
+ * called with std::less / std::greater directly and no std::function
+ * wrapper is needed.
+ */
+template<typename T, bool isAscending>
+void sort0(Array<T> val)
+{
+    T *val_ptr = val.get();
+
+    const af::dim4 dims    = val.dims();
+    const af::dim4 strides = val.strides();
+
+    for(dim_t w = 0; w < dims[3]; w++) {
+        const dim_t valW = w * strides[3];
+        for(dim_t z = 0; z < dims[2]; z++) {
+            const dim_t valWZ = valW + z * strides[2];
+            for(dim_t y = 0; y < dims[1]; y++) {
+
+                const dim_t valOffset = valWZ + y * strides[1];
+
+                // Dimension 0 is treated as contiguous: [first, last).
+                T *first = val_ptr + valOffset;
+                T *last  = first + dims[0];
+
+                if(isAscending) {
+                    std::sort(first, last, std::less<T>());
+                } else {
+                    std::sort(first, last, std::greater<T>());
+                }
+            }
+        }
+    }
+    return;
+}
+
+}
+}
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/kernel/sort_by_key.hpp
similarity index 52%
copy from src/backend/cpu/sort_by_key.cpp
copy to src/backend/cpu/kernel/sort_by_key.hpp
index d2ebd42..77713a7 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/kernel/sort_by_key.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,37 +7,26 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <sort_by_key.hpp>
#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <algorithm>
#include <numeric>
#include <queue>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
-using std::queue;
-using std::async;
+#include <err_cpu.hpp>
namespace cpu
{
-
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
+namespace kernel
+{
template<typename Tk, typename Tv, bool isAscending>
void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
const Array<Tk> ikey, const Array<Tv> ival)
{
- function<bool(Tk, Tk)> op = greater<Tk>();
- if(isAscending) { op = less<Tk>(); }
+ function<bool(Tk, Tk)> op = std::greater<Tk>();
+ if(isAscending) { op = std::less<Tk>(); }
// Get pointers and initialize original index locations
uint *oidx_ptr = oidx.get();
@@ -92,58 +81,5 @@ void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
return;
}
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename Tk, typename Tv, bool isAscending>
-void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
- const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
-{
- ikey.eval();
- ival.eval();
-
- okey = createEmptyArray<Tk>(ikey.dims());
- oval = createEmptyArray<Tv>(ival.dims());
- Array<uint> oidx = createValueArray(ikey.dims(), 0u);
- oidx.eval();
-
- switch(dim) {
- case 0: getQueue().enqueue(sort0_by_key<Tk, Tv, isAscending>,
- okey, oval, oidx, ikey, ival); break;
- default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
- }
}
-
-#define INSTANTIATE(Tk, Tv) \
- template void \
- sort_by_key<Tk, Tv, true>(Array<Tk> &okey, Array<Tv> &oval, \
- const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim); \
- template void \
- sort_by_key<Tk, Tv,false>(Array<Tk> &okey, Array<Tv> &oval, \
- const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim); \
-
-#define INSTANTIATE1(Tk) \
- INSTANTIATE(Tk, float) \
- INSTANTIATE(Tk, double) \
- INSTANTIATE(Tk, int) \
- INSTANTIATE(Tk, uint) \
- INSTANTIATE(Tk, char) \
- INSTANTIATE(Tk, uchar) \
- INSTANTIATE(Tk, short) \
- INSTANTIATE(Tk, ushort) \
- INSTANTIATE(Tk, intl) \
- INSTANTIATE(Tk, uintl) \
-
-
-INSTANTIATE1(float)
-INSTANTIATE1(double)
-INSTANTIATE1(int)
-INSTANTIATE1(uint)
-INSTANTIATE1(char)
-INSTANTIATE1(uchar)
-INSTANTIATE1(short)
-INSTANTIATE1(ushort)
-INSTANTIATE1(intl)
-INSTANTIATE1(uintl)
-
}
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/kernel/sort_index.hpp
similarity index 52%
copy from src/backend/cpu/sort_index.cpp
copy to src/backend/cpu/kernel/sort_index.hpp
index f941534..d2de05a 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/kernel/sort_index.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,35 +7,28 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <sort_index.hpp>
#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <algorithm>
#include <numeric>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
+#include <err_cpu.hpp>
namespace cpu
{
+namespace kernel
+{
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
template<typename T, bool isAscending>
-void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
+void sort0_index(Array<T> val, Array<uint> idx, const Array<T> in)
{
// initialize original index locations
uint *idx_ptr = idx.get();
T *val_ptr = val.get();
const T *in_ptr = in.get();
- function<bool(T, T)> op = greater<T>();
- if(isAscending) { op = less<T>(); }
+ function<bool(T, T)> op = std::greater<T>();
+ if(isAscending) { op = std::less<T>(); }
std::vector<uint> seq_vec(idx.dims()[0]);
std::iota(seq_vec.begin(), seq_vec.end(), 0);
@@ -73,39 +66,5 @@ void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
return;
}
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T, bool isAscending>
-void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
-{
- in.eval();
-
- val = createEmptyArray<T>(in.dims());
- idx = createEmptyArray<uint>(in.dims());
- switch(dim) {
- case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
- default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
- }
}
-
-#define INSTANTIATE(T) \
- template void sort_index<T, true>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
- const uint dim); \
- template void sort_index<T,false>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
- const uint dim); \
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-//INSTANTIATE(cfloat)
-//INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(char)
-INSTANTIATE(uchar)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-
}
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/kernel/susan.hpp
similarity index 55%
copy from src/backend/cpu/susan.cpp
copy to src/backend/cpu/kernel/susan.hpp
index c278908..f543967 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/kernel/susan.hpp
@@ -1,25 +1,20 @@
/*******************************************************
- * Copyright (c) 2015, Arrayfire
- * all rights reserved.
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
*
- * This file is distributed under 3-clause bsd license.
- * the complete license agreement can be obtained at:
- * http://Arrayfire.com/licenses/bsd-3-clause
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/features.h>
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <cmath>
-#include <math.hpp>
-#include <memory>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::features;
-using std::shared_ptr;
namespace cpu
{
+namespace kernel
+{
template<typename T>
void susan_responses(Array<T> output, const Array<T> input,
@@ -100,59 +95,5 @@ void non_maximal(Array<float> xcoords, Array<float> ycoords, Array<float> respon
}
}
-template<typename T>
-unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
- const Array<T> &in,
- const unsigned radius, const float diff_thr, const float geom_thr,
- const float feature_ratio, const unsigned edge)
-{
- in.eval();
-
- dim4 idims = in.dims();
- const unsigned corner_lim = in.elements() * feature_ratio;
-
- auto x_corners = createEmptyArray<float>(dim4(corner_lim));
- auto y_corners = createEmptyArray<float>(dim4(corner_lim));
- auto resp_corners = createEmptyArray<float>(dim4(corner_lim));
- auto response = createEmptyArray<T>(dim4(in.elements()));
- auto corners_found= std::shared_ptr<unsigned>(memAlloc<unsigned>(1), memFree<unsigned>);
- corners_found.get()[0] = 0;
-
- getQueue().enqueue(susan_responses<T>, response, in, idims[0], idims[1],
- radius, diff_thr, geom_thr, edge);
- getQueue().enqueue(non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
- idims[0], idims[1], response, edge, corner_lim);
- getQueue().sync();
-
- const unsigned corners_out = min((corners_found.get())[0], corner_lim);
- if (corners_out == 0) {
- x_out = createEmptyArray<float>(dim4());
- y_out = createEmptyArray<float>(dim4());
- resp_out = createEmptyArray<float>(dim4());
- return 0;
- } else {
- x_out = x_corners;
- y_out = y_corners;
- resp_out = resp_corners;
- x_out.resetDims(dim4(corners_out));
- y_out.resetDims(dim4(corners_out));
- resp_out.resetDims(dim4(corners_out));
- return corners_out;
- }
}
-
-#define INSTANTIATE(T) \
-template unsigned susan<T>(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out, \
- const Array<T> &in, const unsigned radius, const float diff_thr, \
- const float geom_thr, const float feature_ratio, const unsigned edge);
-
-INSTANTIATE(float )
-INSTANTIATE(double)
-INSTANTIATE(char )
-INSTANTIATE(int )
-INSTANTIATE(uint )
-INSTANTIATE(uchar )
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/kernel/tile.hpp b/src/backend/cpu/kernel/tile.hpp
new file mode 100644
index 0000000..3ad3009
--- /dev/null
+++ b/src/backend/cpu/kernel/tile.hpp
@@ -0,0 +1,55 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+/**
+ * Fill `out` by repeating `in` along every dimension.
+ *
+ * For each output coordinate, the source coordinate in a dimension is the
+ * output coordinate modulo the input extent in that dimension. Dimension 0
+ * is addressed with an implicit stride of 1 on both arrays.
+ *
+ * @param out  destination array, written through out.get()
+ * @param in   source array that is repeated
+ */
+template<typename T>
+void tile(Array<T> out, const Array<T> in)
+{
+    T*       dst = out.get();
+    const T* src = in.get();
+
+    const af::dim4 srcDims    = in.dims();
+    const af::dim4 dstDims    = out.dims();
+    const af::dim4 srcStrides = in.strides();
+    const af::dim4 dstStrides = out.strides();
+
+    for (dim_t w = 0; w < dstDims[3]; ++w) {
+        // Base pointers of the current 3-D volume in each array.
+        const T* srcW = src + (w % srcDims[3]) * srcStrides[3];
+        T*       dstW = dst + w * dstStrides[3];
+
+        for (dim_t z = 0; z < dstDims[2]; ++z) {
+            const T* srcZ = srcW + (z % srcDims[2]) * srcStrides[2];
+            T*       dstZ = dstW + z * dstStrides[2];
+
+            for (dim_t y = 0; y < dstDims[1]; ++y) {
+                const T* srcY = srcZ + (y % srcDims[1]) * srcStrides[1];
+                T*       dstY = dstZ + y * dstStrides[1];
+
+                for (dim_t x = 0; x < dstDims[0]; ++x) {
+                    dstY[x] = srcY[x % srcDims[0]];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/kernel/transform.hpp
similarity index 62%
copy from src/backend/cpu/transform.cpp
copy to src/backend/cpu/kernel/transform.hpp
index a7287ce..d97613a 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/kernel/transform.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,17 +7,15 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <transform.hpp>
-#include <math.hpp>
-#include <stdexcept>
#include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-#include "transform_interp.hpp"
namespace cpu
{
+namespace kernel
+{
template <typename T>
void calc_affine_inverse(T *txo, const T *txi)
@@ -48,8 +46,8 @@ void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse)
}
template<typename T, af_interp_type method>
-void transform_(Array<T> output, const Array<T> input,
- const Array<float> transform, const bool inverse)
+void transform(Array<T> output, const Array<T> input,
+ const Array<float> transform, const bool inverse)
{
const af::dim4 idims = input.dims();
const af::dim4 odims = output.dims();
@@ -103,49 +101,5 @@ void transform_(Array<T> output, const Array<T> input,
}
}
-template<typename T>
-Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
- const af_interp_type method, const bool inverse)
-{
- in.eval();
- transform.eval();
-
- Array<T> out = createEmptyArray<T>(odims);
-
- switch(method) {
- case AF_INTERP_NEAREST :
- getQueue().enqueue(transform_<T, AF_INTERP_NEAREST >, out, in, transform, inverse);
- break;
- case AF_INTERP_BILINEAR:
- getQueue().enqueue(transform_<T, AF_INTERP_BILINEAR>, out, in, transform, inverse);
- break;
- case AF_INTERP_LOWER :
- getQueue().enqueue(transform_<T, AF_INTERP_LOWER >, out, in, transform, inverse);
- break;
- default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break;
- }
-
- return out;
}
-
-
-#define INSTANTIATE(T) \
-template Array<T> transform(const Array<T> &in, const Array<float> &transform, \
- const af::dim4 &odims, const af_interp_type method, \
- const bool inverse);
-
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/kernel/transpose.hpp
similarity index 65%
copy from src/backend/cpu/transpose.cpp
copy to src/backend/cpu/kernel/transpose.hpp
index 7e7eec1..576de87 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/kernel/transpose.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,30 +7,16 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <af/dim4.hpp>
+#pragma once
#include <af/defines.h>
-#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <transpose.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-#include <utility>
-#include <cassert>
-
-using af::dim4;
+#include <utility.hpp>
+#include <err_cpu.hpp>
namespace cpu
{
-
-static inline unsigned getIdx(const dim4 &strides,
- int i, int j = 0, int k = 0, int l = 0)
+namespace kernel
{
- return (l * strides[3] +
- k * strides[2] +
- j * strides[1] +
- i );
-}
template<typename T>
T getConjugate(const T &in)
@@ -52,7 +38,7 @@ cdouble getConjugate(const cdouble &in)
}
template<typename T, bool conjugate>
-void transpose_(Array<T> output, const Array<T> input)
+void transpose(Array<T> output, const Array<T> input)
{
const dim4 odims = output.dims();
const dim4 ostrides = output.strides();
@@ -86,24 +72,9 @@ void transpose_(Array<T> output, const Array<T> input)
}
template<typename T>
-void transpose_(Array<T> out, const Array<T> in, const bool conjugate)
-{
- return (conjugate ? transpose_<T, true>(out, in) : transpose_<T, false>(out, in));
-}
-
-template<typename T>
-Array<T> transpose(const Array<T> &in, const bool conjugate)
+void transpose(Array<T> out, const Array<T> in, const bool conjugate)
{
- in.eval();
-
- const dim4 inDims = in.dims();
- const dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]);
- // create an array with first two dimensions swapped
- Array<T> out = createEmptyArray<T>(outDims);
-
- getQueue().enqueue(transpose_<T>, out, in, conjugate);
-
- return out;
+ return (conjugate ? transpose<T, true>(out, in) : transpose<T, false>(out, in));
}
template<typename T, bool conjugate>
@@ -142,33 +113,10 @@ void transpose_inplace(Array<T> input)
}
template<typename T>
-void transpose_inplace_(Array<T> in, const bool conjugate)
+void transpose_inplace(Array<T> in, const bool conjugate)
{
return (conjugate ? transpose_inplace<T, true >(in) : transpose_inplace<T, false>(in));
}
-template<typename T>
-void transpose_inplace(Array<T> &in, const bool conjugate)
-{
- in.eval();
- getQueue().enqueue(transpose_inplace_<T>, in, conjugate);
}
-
-#define INSTANTIATE(T) \
- template Array<T> transpose(const Array<T> &in, const bool conjugate); \
- template void transpose_inplace(Array<T> &in, const bool conjugate);
-
-INSTANTIATE(float )
-INSTANTIATE(cfloat )
-INSTANTIATE(double )
-INSTANTIATE(cdouble)
-INSTANTIATE(char )
-INSTANTIATE(int )
-INSTANTIATE(uint )
-INSTANTIATE(uchar )
-INSTANTIATE(intl )
-INSTANTIATE(uintl )
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/kernel/triangle.hpp b/src/backend/cpu/kernel/triangle.hpp
new file mode 100644
index 0000000..7059de5
--- /dev/null
+++ b/src/backend/cpu/kernel/triangle.hpp
@@ -0,0 +1,61 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+/**
+ * Copy one triangle of `in` into `out` and zero everything else.
+ *
+ * An element is kept when its index along dimension 1 is >= its index along
+ * dimension 0 (is_upper), or <= it (!is_upper). When is_unit_diag is set,
+ * elements whose two indices are equal are written as 1 instead of being
+ * copied. The iteration space is the dimensions of `out`; dimension 0 is
+ * addressed with an implicit stride of 1 on both arrays.
+ *
+ * @param out  destination array, written through out.get()
+ * @param in   source array
+ */
+template<typename T, bool is_upper, bool is_unit_diag>
+void triangle(Array<T> out, const Array<T> in)
+{
+    T       *dst = out.get();
+    const T *src = in.get();
+
+    const af::dim4 dims       = out.dims();
+    const af::dim4 dstStrides = out.strides();
+    const af::dim4 srcStrides = in.strides();
+
+    for (dim_t w = 0; w < dims[3]; ++w) {
+        for (dim_t z = 0; z < dims[2]; ++z) {
+            const dim_t dstZW = w * dstStrides[3] + z * dstStrides[2];
+            const dim_t srcZW = w * srcStrides[3] + z * srcStrides[2];
+
+            for (dim_t y = 0; y < dims[1]; ++y) {
+                const dim_t dstBase = dstZW + y * dstStrides[1];
+                const dim_t srcBase = srcZW + y * srcStrides[1];
+
+                for (dim_t x = 0; x < dims[0]; ++x) {
+                    const bool inside = is_upper ? (y >= x) : (y <= x);
+
+                    if (!inside) {
+                        dst[dstBase + x] = scalar<T>(0);
+                    } else if (is_unit_diag && x == y) {
+                        dst[dstBase + x] = scalar<T>(1);
+                    } else {
+                        dst[dstBase + x] = src[srcBase + x];
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/kernel/unwrap.hpp
similarity index 63%
copy from src/backend/cpu/unwrap.cpp
copy to src/backend/cpu/kernel/unwrap.hpp
index 41423c7..1d996ff 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/kernel/unwrap.hpp
@@ -1,5 +1,5 @@
/*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
@@ -7,17 +7,15 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <unwrap.hpp>
-#include <stdexcept>
#include <err_cpu.hpp>
-#include <dispatch.hpp>
-#include <math.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
namespace cpu
{
+namespace kernel
+{
template<typename T, int d>
void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
@@ -79,50 +77,5 @@ void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
}
}
-template<typename T>
-Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
-{
- in.eval();
-
- af::dim4 idims = in.dims();
- dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
- dim_t ny = (idims[1] + 2 * py - wy) / sy + 1;
-
- af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
-
- if (!is_column) {
- std::swap(odims[0], odims[1]);
- }
-
- Array<T> outArray = createEmptyArray<T>(odims);
-
- if (is_column) {
- getQueue().enqueue(unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
- } else {
- getQueue().enqueue(unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
- }
-
- return outArray;
}
-
-
-#define INSTANTIATE(T) \
- template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
-
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/kernel/wrap.hpp
similarity index 62%
copy from src/backend/cpu/wrap.cpp
copy to src/backend/cpu/kernel/wrap.hpp
index 3ff54de..70be3ad 100644
--- a/src/backend/cpu/wrap.cpp
+++ b/src/backend/cpu/kernel/wrap.hpp
@@ -7,17 +7,15 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <af/defines.h>
#include <Array.hpp>
-#include <wrap.hpp>
-#include <stdexcept>
#include <err_cpu.hpp>
-#include <dispatch.hpp>
-#include <math.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
namespace cpu
{
+namespace kernel
+{
template<typename T, int d>
void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
@@ -78,50 +76,5 @@ void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
}
}
-template<typename T>
-Array<T> wrap(const Array<T> &in,
- const dim_t ox, const dim_t oy,
- const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy,
- const dim_t px, const dim_t py,
- const bool is_column)
-{
- af::dim4 idims = in.dims();
- af::dim4 odims(ox, oy, idims[2], idims[3]);
-
- Array<T> out = createValueArray<T>(odims, scalar<T>(0));
- out.eval();
- in.eval();
-
- if (is_column) {
- getQueue().enqueue(wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
- } else {
- getQueue().enqueue(wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
- }
-
- return out;
}
-
-
-#define INSTANTIATE(T) \
- template Array<T> wrap<T> (const Array<T> &in, \
- const dim_t ox, const dim_t oy, \
- const dim_t wx, const dim_t wy, \
- const dim_t sx, const dim_t sy, \
- const dim_t px, const dim_t py, \
- const bool is_column);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
}
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
index b6f50c2..8292562 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -11,139 +11,16 @@
#include <af/defines.h>
#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <err_cpu.hpp>
#include <handle.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/nearest_neighbour.hpp>
using af::dim4;
namespace cpu
{
-#if defined(_WIN32) || defined(_MSC_VER)
-
-#include <intrin.h>
-#define __builtin_popcount __popcnt
-
-#endif
-
-template<typename T, typename To, af_match_type dist_type>
-struct dist_op
-{
- To operator()(T v1, T v2)
- {
- return v1 - v2; // Garbage distance
- }
-};
-
-template<typename T, typename To>
-struct dist_op<T, To, AF_SAD>
-{
- To operator()(T v1, T v2)
- {
- return std::abs((double)v1 - (double)v2);
- }
-};
-
-template<typename T, typename To>
-struct dist_op<T, To, AF_SSD>
-{
- To operator()(T v1, T v2)
- {
- return (v1 - v2) * (v1 - v2);
- }
-};
-
-template<typename To>
-struct dist_op<uint, To, AF_SHD>
-{
- To operator()(uint v1, uint v2)
- {
- return __builtin_popcount(v1 ^ v2);
- }
-};
-
-template<typename To>
-struct dist_op<uintl, To, AF_SHD>
-{
- To operator()(uintl v1, uintl v2)
- {
- return __builtin_popcount(v1 ^ v2);
- }
-};
-
-template<typename To>
-struct dist_op<uchar, To, AF_SHD>
-{
- To operator()(uchar v1, uchar v2)
- {
- return __builtin_popcount(v1 ^ v2);
- }
-};
-
-template<typename To>
-struct dist_op<ushort, To, AF_SHD>
-{
- To operator()(ushort v1, ushort v2)
- {
- return __builtin_popcount(v1 ^ v2);
- }
-};
-
-template<typename T, typename To, af_match_type dist_type>
-void nearest_neighbour_(Array<uint> idx, Array<To> dist,
- const Array<T> query, const Array<T> train,
- const uint dist_dim, const uint n_dist)
-{
- uint sample_dim = (dist_dim == 0) ? 1 : 0;
- const dim4 qDims = query.dims();
- const dim4 tDims = train.dims();
-
- const unsigned distLength = qDims[dist_dim];
- const unsigned nQuery = qDims[sample_dim];
- const unsigned nTrain = tDims[sample_dim];
-
- const T* qPtr = query.get();
- const T* tPtr = train.get();
- uint* iPtr = idx.get();
- To* dPtr = dist.get();
-
- dist_op<T, To, dist_type> op;
-
- for (unsigned i = 0; i < nQuery; i++) {
- To best_dist = limit_max<To>();
- unsigned best_idx = 0;
-
- for (unsigned j = 0; j < nTrain; j++) {
- To local_dist = 0;
- for (unsigned k = 0; k < distLength; k++) {
- size_t qIdx, tIdx;
- if (sample_dim == 0) {
- qIdx = k * qDims[0] + i;
- tIdx = k * tDims[0] + j;
- }
- else {
- qIdx = i * qDims[0] + k;
- tIdx = j * tDims[0] + k;
- }
-
- local_dist += op(qPtr[qIdx], tPtr[tIdx]);
- }
-
- if (local_dist < best_dist) {
- best_dist = local_dist;
- best_idx = j;
- }
- }
-
- size_t oIdx;
- oIdx = i;
- iPtr[oIdx] = best_idx;
- dPtr[oIdx] = best_dist;
- }
-}
-
template<typename T, typename To>
void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
const Array<T>& query, const Array<T>& train,
@@ -166,13 +43,13 @@ void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
switch(dist_type) {
case AF_SAD:
- getQueue().enqueue(nearest_neighbour_<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
+ getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
break;
case AF_SSD:
- getQueue().enqueue(nearest_neighbour_<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
+ getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
break;
case AF_SHD:
- getQueue().enqueue(nearest_neighbour_<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
+ getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
break;
default:
AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp
index 4b6629c..00fe820 100644
--- a/src/backend/cpu/orb.cpp
+++ b/src/backend/cpu/orb.cpp
@@ -11,7 +11,6 @@
#include <af/defines.h>
#include <ArrayInfo.hpp>
#include <Array.hpp>
-#include <err_cpu.hpp>
#include <handle.hpp>
#include <resize.hpp>
#include <fast.hpp>
@@ -21,520 +20,13 @@
#include <cstring>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/orb.hpp>
using af::dim4;
namespace cpu
{
-static const float PI_VAL = 3.14159265358979323846f;
-
-// Reference pattern, generated for a patch size of 31x31, as suggested by
-// original ORB paper
-#define REF_PAT_SIZE 31
-#define REF_PAT_SAMPLES 256
-#define REF_PAT_COORDS 4
-#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS)
-
-// Current reference pattern was borrowed from OpenCV, to build a pattern with
-// similar quality, a training process must be applied, as described in
-// sections 4.2 and 4.3 of the original ORB paper.
-const int ref_pat[REF_PAT_LENGTH] = {
- 8,-3, 9,5,
- 4,2, 7,-12,
- -11,9, -8,2,
- 7,-12, 12,-13,
- 2,-13, 2,12,
- 1,-7, 1,6,
- -2,-10, -2,-4,
- -13,-13, -11,-8,
- -13,-3, -12,-9,
- 10,4, 11,9,
- -13,-8, -8,-9,
- -11,7, -9,12,
- 7,7, 12,6,
- -4,-5, -3,0,
- -13,2, -12,-3,
- -9,0, -7,5,
- 12,-6, 12,-1,
- -3,6, -2,12,
- -6,-13, -4,-8,
- 11,-13, 12,-8,
- 4,7, 5,1,
- 5,-3, 10,-3,
- 3,-7, 6,12,
- -8,-7, -6,-2,
- -2,11, -1,-10,
- -13,12, -8,10,
- -7,3, -5,-3,
- -4,2, -3,7,
- -10,-12, -6,11,
- 5,-12, 6,-7,
- 5,-6, 7,-1,
- 1,0, 4,-5,
- 9,11, 11,-13,
- 4,7, 4,12,
- 2,-1, 4,4,
- -4,-12, -2,7,
- -8,-5, -7,-10,
- 4,11, 9,12,
- 0,-8, 1,-13,
- -13,-2, -8,2,
- -3,-2, -2,3,
- -6,9, -4,-9,
- 8,12, 10,7,
- 0,9, 1,3,
- 7,-5, 11,-10,
- -13,-6, -11,0,
- 10,7, 12,1,
- -6,-3, -6,12,
- 10,-9, 12,-4,
- -13,8, -8,-12,
- -13,0, -8,-4,
- 3,3, 7,8,
- 5,7, 10,-7,
- -1,7, 1,-12,
- 3,-10, 5,6,
- 2,-4, 3,-10,
- -13,0, -13,5,
- -13,-7, -12,12,
- -13,3, -11,8,
- -7,12, -4,7,
- 6,-10, 12,8,
- -9,-1, -7,-6,
- -2,-5, 0,12,
- -12,5, -7,5,
- 3,-10, 8,-13,
- -7,-7, -4,5,
- -3,-2, -1,-7,
- 2,9, 5,-11,
- -11,-13, -5,-13,
- -1,6, 0,-1,
- 5,-3, 5,2,
- -4,-13, -4,12,
- -9,-6, -9,6,
- -12,-10, -8,-4,
- 10,2, 12,-3,
- 7,12, 12,12,
- -7,-13, -6,5,
- -4,9, -3,4,
- 7,-1, 12,2,
- -7,6, -5,1,
- -13,11, -12,5,
- -3,7, -2,-6,
- 7,-8, 12,-7,
- -13,-7, -11,-12,
- 1,-3, 12,12,
- 2,-6, 3,0,
- -4,3, -2,-13,
- -1,-13, 1,9,
- 7,1, 8,-6,
- 1,-1, 3,12,
- 9,1, 12,6,
- -1,-9, -1,3,
- -13,-13, -10,5,
- 7,7, 10,12,
- 12,-5, 12,9,
- 6,3, 7,11,
- 5,-13, 6,10,
- 2,-12, 2,3,
- 3,8, 4,-6,
- 2,6, 12,-13,
- 9,-12, 10,3,
- -8,4, -7,9,
- -11,12, -4,-6,
- 1,12, 2,-8,
- 6,-9, 7,-4,
- 2,3, 3,-2,
- 6,3, 11,0,
- 3,-3, 8,-8,
- 7,8, 9,3,
- -11,-5, -6,-4,
- -10,11, -5,10,
- -5,-8, -3,12,
- -10,5, -9,0,
- 8,-1, 12,-6,
- 4,-6, 6,-11,
- -10,12, -8,7,
- 4,-2, 6,7,
- -2,0, -2,12,
- -5,-8, -5,2,
- 7,-6, 10,12,
- -9,-13, -8,-8,
- -5,-13, -5,-2,
- 8,-8, 9,-13,
- -9,-11, -9,0,
- 1,-8, 1,-2,
- 7,-4, 9,1,
- -2,1, -1,-4,
- 11,-6, 12,-11,
- -12,-9, -6,4,
- 3,7, 7,12,
- 5,5, 10,8,
- 0,-4, 2,8,
- -9,12, -5,-13,
- 0,7, 2,12,
- -1,2, 1,7,
- 5,11, 7,-9,
- 3,5, 6,-8,
- -13,-4, -8,9,
- -5,9, -3,-3,
- -4,-7, -3,-12,
- 6,5, 8,0,
- -7,6, -6,12,
- -13,6, -5,-2,
- 1,-10, 3,10,
- 4,1, 8,-4,
- -2,-2, 2,-13,
- 2,-12, 12,12,
- -2,-13, 0,-6,
- 4,1, 9,3,
- -6,-10, -3,-5,
- -3,-13, -1,1,
- 7,5, 12,-11,
- 4,-2, 5,-7,
- -13,9, -9,-5,
- 7,1, 8,6,
- 7,-8, 7,6,
- -7,-4, -7,1,
- -8,11, -7,-8,
- -13,6, -12,-8,
- 2,4, 3,9,
- 10,-5, 12,3,
- -6,-5, -6,7,
- 8,-3, 9,-8,
- 2,-12, 2,8,
- -11,-2, -10,3,
- -12,-13, -7,-9,
- -11,0, -10,-5,
- 5,-3, 11,8,
- -2,-13, -1,12,
- -1,-8, 0,9,
- -13,-11, -12,-5,
- -10,-2, -10,11,
- -3,9, -2,-13,
- 2,-3, 3,2,
- -9,-13, -4,0,
- -4,6, -3,-10,
- -4,12, -2,-7,
- -6,-11, -4,9,
- 6,-3, 6,11,
- -13,11, -5,5,
- 11,11, 12,6,
- 7,-5, 12,-2,
- -1,12, 0,7,
- -4,-8, -3,-2,
- -7,1, -6,7,
- -13,-12, -8,-13,
- -7,-2, -6,-8,
- -8,5, -6,-9,
- -5,-1, -4,5,
- -13,7, -8,10,
- 1,5, 5,-13,
- 1,0, 10,-13,
- 9,12, 10,-1,
- 5,-8, 10,-9,
- -1,11, 1,-13,
- -9,-3, -6,2,
- -1,-10, 1,12,
- -13,1, -8,-10,
- 8,-11, 10,-6,
- 2,-13, 3,-6,
- 7,-13, 12,-9,
- -10,-10, -5,-7,
- -10,-8, -8,-13,
- 4,-6, 8,5,
- 3,12, 8,-13,
- -4,2, -3,-3,
- 5,-13, 10,-12,
- 4,-13, 5,-1,
- -9,9, -4,3,
- 0,3, 3,-9,
- -12,1, -6,1,
- 3,2, 4,-8,
- -10,-10, -10,9,
- 8,-13, 12,12,
- -8,-12, -6,-5,
- 2,2, 3,7,
- 10,6, 11,-8,
- 6,8, 8,-12,
- -7,10, -6,5,
- -3,-9, -3,9,
- -1,-13, -1,5,
- -3,-7, -3,4,
- -8,-2, -8,3,
- 4,2, 12,12,
- 2,-5, 3,11,
- 6,-9, 11,-13,
- 3,-1, 7,12,
- 11,-1, 12,4,
- -3,0, -3,6,
- 4,-11, 4,12,
- 2,-4, 2,1,
- -10,-6, -8,1,
- -13,7, -11,1,
- -13,12, -11,-13,
- 6,0, 11,-13,
- 0,-1, 1,4,
- -13,3, -9,-2,
- -9,8, -6,-3,
- -13,-6, -8,-2,
- 5,-9, 8,10,
- 2,7, 3,-9,
- -1,-6, -1,-1,
- 9,5, 11,-2,
- 11,-3, 12,-8,
- 3,0, 3,5,
- -1,4, 0,10,
- 3,-6, 4,5,
- -13,0, -10,5,
- 5,8, 12,11,
- 8,9, 9,-6,
- 7,-4, 8,-12,
- -10,4, -10,9,
- 7,3, 12,4,
- 9,-7, 10,-2,
- 7,0, 12,-2,
- -1,-6, 0,-11,
-};
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
- if(!(sigma>0)) sigma = 0.25*dim;
-
- T sum = (T)0;
- for(int i=0;i<dim;i++)
- {
- int x = i-(dim-1)/2;
- T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
- out[i] = el;
- sum += el;
- }
-
- for(int k=0;k<dim;k++)
- out[k] /= sum;
-}
-
-template<typename T>
-void keep_features(
- float* x_out,
- float* y_out,
- float* score_out,
- float* size_out,
- const float* x_in,
- const float* y_in,
- const float* score_in,
- const unsigned* score_idx,
- const float* size_in,
- const unsigned n_feat)
-{
- // Keep only the first n_feat features
- for (unsigned f = 0; f < n_feat; f++) {
- x_out[f] = x_in[score_idx[f]];
- y_out[f] = y_in[score_idx[f]];
- score_out[f] = score_in[f];
- if (size_in != nullptr && size_out != nullptr)
- size_out[f] = size_in[score_idx[f]];
- }
-}
-
-template<typename T, bool use_scl>
-void harris_response(
- float* x_out,
- float* y_out,
- float* score_out,
- float* size_out,
- const float* x_in,
- const float* y_in,
- const float* scl_in,
- const unsigned total_feat,
- unsigned* usable_feat,
- const Array<T>& image,
- const unsigned block_size,
- const float k_thr,
- const unsigned patch_size)
-{
- const af::dim4 idims = image.dims();
- const T* image_ptr = image.get();
- for (unsigned f = 0; f < total_feat; f++) {
- unsigned x, y;
- float scl = 1.f;
- if (use_scl) {
- // Update x and y coordinates according to scale
- scl = scl_in[f];
- x = (unsigned)round(x_in[f] * scl);
- y = (unsigned)round(y_in[f] * scl);
- }
- else {
- x = (unsigned)round(x_in[f]);
- y = (unsigned)round(y_in[f]);
- }
-
- // Round feature size to nearest odd integer
- float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
-
- // Avoid keeping features that might be too wide and might not fit on
- // the image, sqrt(2.f) is the radius when angle is 45 degrees and
- // represents widest case possible
- unsigned patch_r = ceil(size * sqrt(2.f) / 2.f);
- if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r)
- continue;
-
- unsigned r = block_size / 2;
-
- float ixx = 0.f, iyy = 0.f, ixy = 0.f;
- unsigned block_size_sq = block_size * block_size;
- for (unsigned k = 0; k < block_size_sq; k++) {
- int i = k / block_size - r;
- int j = k % block_size - r;
-
- // Calculate local x and y derivatives
- float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j];
- float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1];
-
- // Accumulate second order derivatives
- ixx += ix*ix;
- iyy += iy*iy;
- ixy += ix*iy;
- }
-
- unsigned idx = *usable_feat;
- *usable_feat += 1;
- float tr = ixx + iyy;
- float det = ixx*iyy - ixy*ixy;
-
- // Calculate Harris responses
- float resp = det - k_thr * (tr*tr);
-
- // Scale factor
- // TODO: improve response scaling
- float rscale = 0.001f;
- rscale = rscale * rscale * rscale * rscale;
-
- x_out[idx] = x;
- y_out[idx] = y;
- score_out[idx] = resp * rscale;
- if (use_scl)
- size_out[idx] = size;
- }
-}
-
-template<typename T>
-void centroid_angle(
- const float* x_in,
- const float* y_in,
- float* orientation_out,
- const unsigned total_feat,
- const Array<T>& image,
- const unsigned patch_size)
-{
- const af::dim4 idims = image.dims();
- const T* image_ptr = image.get();
- for (unsigned f = 0; f < total_feat; f++) {
- unsigned x = (unsigned)round(x_in[f]);
- unsigned y = (unsigned)round(y_in[f]);
-
- unsigned r = patch_size / 2;
- if (x < r || y < r || x > idims[1] - r || y > idims[0] - r)
- continue;
-
- T m01 = (T)0, m10 = (T)0;
- unsigned patch_size_sq = patch_size * patch_size;
- for (unsigned k = 0; k < patch_size_sq; k++) {
- int i = k / patch_size - r;
- int j = k % patch_size - r;
-
- // Calculate first order moments
- T p = image_ptr[(x+i) * idims[0] + y+j];
- m01 += j * p;
- m10 += i * p;
- }
-
- float angle = atan2(m01, m10);
- orientation_out[f] = angle;
- }
-}
-
-template<typename T>
-inline T get_pixel(
- unsigned x,
- unsigned y,
- const float ori,
- const unsigned size,
- const int dist_x,
- const int dist_y,
- const Array<T>& image,
- const unsigned patch_size)
-{
- const af::dim4 idims = image.dims();
- const T* image_ptr = image.get();
- float ori_sin = sin(ori);
- float ori_cos = cos(ori);
- float patch_scl = (float)size / (float)patch_size;
-
- // Calculate point coordinates based on orientation and size
- x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
- y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
-
- return image_ptr[x * idims[0] + y];
-}
-
-template<typename T>
-void extract_orb(
- unsigned* desc_out,
- const unsigned n_feat,
- float* x_in_out,
- float* y_in_out,
- const float* ori_in,
- float* size_out,
- const Array<T>& image,
- const float scl,
- const unsigned patch_size)
-{
- const af::dim4 idims = image.dims();
- for (unsigned f = 0; f < n_feat; f++) {
- unsigned x = (unsigned)round(x_in_out[f]);
- unsigned y = (unsigned)round(y_in_out[f]);
- float ori = ori_in[f];
- unsigned size = patch_size;
-
- unsigned r = ceil(patch_size * sqrt(2.f) / 2.f);
- if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r)
- continue;
-
- // Descriptor fixed at 256 bits for now
- // Storing descriptor as a vector of 8 x 32-bit unsigned numbers
- for (unsigned i = 0; i < 8; i++) {
- unsigned v = 0;
-
- // j < 32 for 256 bits descriptor
- for (unsigned j = 0; j < 32; j++) {
- // Get position from distribution pattern and values of points p1 and p2
- int dist_x = ref_pat[i*32*4 + j*4];
- int dist_y = ref_pat[i*32*4 + j*4+1];
- T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);
-
- dist_x = ref_pat[i*32*4 + j*4+2];
- dist_y = ref_pat[i*32*4 + j*4+3];
- T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);
-
- // Calculate bit based on p1 and p2 and shifts it to correct position
- v |= (p1 < p2) << j;
- }
-
- // Store 32 bits of descriptor
- desc_out[f * 8 + i] += v;
- }
-
- x_in_out[f] = round(x * scl);
- y_in_out[f] = round(y * scl);
- size_out[f] = patch_size * scl;
- }
-}
-
-
-
template<typename T, typename convAccT>
unsigned orb(Array<float> &x, Array<float> &y,
Array<float> &score, Array<float> &ori,
@@ -652,7 +144,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
// Calculate Harris responses
// Good block_size >= 7 (must be an odd number)
unsigned usable_feat = 0;
- harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
+ kernel::harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
h_x_feat, h_y_feat, nullptr,
lvl_feat, &usable_feat,
lvl_img,
@@ -689,7 +181,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
float* h_score_lvl = memAlloc<float>(usable_feat);
// Keep only features with higher Harris responses
- keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
+ kernel::keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(),
nullptr, usable_feat);
@@ -700,7 +192,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
float* h_size_lvl = memAlloc<float>(usable_feat);
// Compute orientation of features
- centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
+ kernel::centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
lvl_img, patch_size);
Array<T> lvl_filt = createEmptyArray<T>(dim4());
@@ -723,11 +215,11 @@ unsigned orb(Array<float> &x, Array<float> &y,
unsigned* h_desc_lvl = memAlloc<unsigned>(usable_feat * 8);
memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned));
if (blur_img)
- extract_orb<T>(h_desc_lvl, usable_feat,
+ kernel::extract_orb<T>(h_desc_lvl, usable_feat,
h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
lvl_filt, lvl_scl, patch_size);
else
- extract_orb<T>(h_desc_lvl, usable_feat,
+ kernel::extract_orb<T>(h_desc_lvl, usable_feat,
h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
lvl_img, lvl_scl, patch_size);
diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp
index 8c83ad6..55cf295 100644
--- a/src/backend/cpu/random.cpp
+++ b/src/backend/cpu/random.cpp
@@ -7,12 +7,6 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
-#include <type_traits>
-#include <random>
-#include <algorithm>
-#include <functional>
-#include <limits>
-#include <type_traits>
#include <af/array.h>
#include <af/dim4.hpp>
#include <af/defines.h>
@@ -20,140 +14,16 @@
#include <random.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/random.hpp>
namespace cpu
{
-using namespace std;
-
-template<typename T>
-using is_arithmetic_t = typename enable_if< is_arithmetic<T>::value, function<T()>>::type;
-template<typename T>
-using is_complex_t = typename enable_if< is_complex<T>::value, function<T()>>::type;
-template<typename T>
-using is_floating_point_t = typename enable_if< is_floating_point<T>::value, function<T()>>::type;
-
-template<typename T, typename GenType>
-is_arithmetic_t<T>
-urand(GenType &generator)
-{
- typedef typename conditional< is_floating_point<T>::value,
- uniform_real_distribution<T>,
-#if OS_WIN
- uniform_int_distribution<unsigned>>::type dist;
-#else
- uniform_int_distribution<T >> ::type dist;
-#endif
- return bind(dist(), generator);
-}
-
-template<typename T, typename GenType>
-is_complex_t<T>
-urand(GenType &generator)
-{
- auto func = urand<typename T::value_type>(generator);
- return [func] () { return T(func(), func());};
-}
-
-template<typename T, typename GenType>
-is_floating_point_t<T>
-nrand(GenType &generator)
-{
- return bind(normal_distribution<T>(), generator);
-}
-
-template<typename T, typename GenType>
-is_complex_t<T>
-nrand(GenType &generator)
-{
- auto func = nrand<typename T::value_type>(generator);
- return [func] () { return T(func(), func());};
-}
-
-static default_random_engine generator;
-static unsigned long long gen_seed = 0;
-static bool is_first = true;
-#define GLOBAL 1
-
-template<typename T>
-void randn_(Array<T> out)
-{
- static unsigned long long my_seed = 0;
- if (is_first) {
- setSeed(gen_seed);
- my_seed = gen_seed;
- }
-
- static auto gen = nrand<T>(generator);
-
- if (my_seed != gen_seed) {
- gen = nrand<T>(generator);
- my_seed = gen_seed;
- }
-
- T *outPtr = out.get();
- for (int i = 0; i < (int)out.elements(); i++) {
- outPtr[i] = gen();
- }
-}
-
-template<typename T>
-Array<T> randn(const af::dim4 &dims)
-{
- Array<T> outArray = createEmptyArray<T>(dims);
- getQueue().enqueue(randn_<T>, outArray);
- return outArray;
-}
-
-template<typename T>
-void randu_(Array<T> out)
-{
- static unsigned long long my_seed = 0;
- if (is_first) {
- setSeed(gen_seed);
- my_seed = gen_seed;
- }
-
- static auto gen = urand<T>(generator);
-
- if (my_seed != gen_seed) {
- gen = urand<T>(generator);
- my_seed = gen_seed;
- }
-
- T *outPtr = out.get();
- for (int i = 0; i < (int)out.elements(); i++) {
- outPtr[i] = gen();
- }
-}
-
-template<>
-void randu_(Array<char> out)
-{
- static unsigned long long my_seed = 0;
- if (is_first) {
- setSeed(gen_seed);
- my_seed = gen_seed;
- }
-
- static auto gen = urand<float>(generator);
-
- if (my_seed != gen_seed) {
- gen = urand<float>(generator);
- my_seed = gen_seed;
- }
-
- char *outPtr = out.get();
- for (int i = 0; i < (int)out.elements(); i++) {
- outPtr[i] = gen() > 0.5;
- }
-}
-
template<typename T>
Array<T> randu(const af::dim4 &dims)
{
Array<T> outArray = createEmptyArray<T>(dims);
- getQueue().enqueue(randu_<T>, outArray);
+ getQueue().enqueue(kernel::randu<T>, outArray);
return outArray;
}
@@ -172,6 +42,14 @@ INSTANTIATE_UNIFORM(uchar)
INSTANTIATE_UNIFORM(short)
INSTANTIATE_UNIFORM(ushort)
+template<typename T> // Builds an Array<T> of the requested dims and hands it to kernel::randn<T> through the queue.
+Array<T> randn(const af::dim4 &dims) // dims: shape of the array to create; the created array is returned by value.
+{
+ Array<T> outArray = createEmptyArray<T>(dims); // output storage only; contents are left to the kernel
+ getQueue().enqueue(kernel::randn<T>, outArray); // the fill is enqueued, not called directly; NOTE(review): presumably it may still be pending when this function returns - confirm against the queue implementation
+ return outArray;
+}
+
#define INSTANTIATE_NORMAL(T) \
template Array<T> randn<T>(const af::dim4 &dims);
@@ -184,32 +62,36 @@ template<>
Array<char> randu(const af::dim4 &dims)
{
static unsigned long long my_seed = 0;
- if (is_first) {
- setSeed(gen_seed);
- my_seed = gen_seed;
+ if (kernel::is_first) {
+ setSeed(kernel::gen_seed);
+ my_seed = kernel::gen_seed;
}
- static auto gen = urand<float>(generator);
+ static auto gen = kernel::urand<float>(kernel::generator);
- if (my_seed != gen_seed) {
- gen = urand<float>(generator);
- my_seed = gen_seed;
+ if (my_seed != kernel::gen_seed) {
+ gen = kernel::urand<float>(kernel::generator);
+ my_seed = kernel::gen_seed;
}
Array<char> outArray = createEmptyArray<char>(dims);
- char *outPtr = outArray.get();
- for (int i = 0; i < (int)outArray.elements(); i++) {
- outPtr[i] = gen() > 0.5;
- }
+ auto func = [=](Array<char> outArray) {
+ char *outPtr = outArray.get();
+ for (int i = 0; i < (int)outArray.elements(); i++) {
+ outPtr[i] = gen() > 0.5;
+ }
+ };
+ getQueue().enqueue(func, outArray);
+
return outArray;
}
void setSeed(const uintl seed)
{
auto f = [=](const uintl seed){
- generator.seed(seed);
- is_first = false;
- gen_seed = seed;
+ kernel::generator.seed(seed);
+ kernel::is_first = false;
+ kernel::gen_seed = seed;
};
getQueue().enqueue(f, seed);
}
@@ -217,7 +99,7 @@ void setSeed(const uintl seed)
uintl getSeed()
{
getQueue().sync();
- return gen_seed;
+ return kernel::gen_seed;
}
}
diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp
index 7837db5..b5ba5f8 100644
--- a/src/backend/cpu/range.cpp
+++ b/src/backend/cpu/range.cpp
@@ -16,47 +16,11 @@
#include <numeric>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/range.hpp>
namespace cpu
{
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T, int dim>
-void range(Array<T> output)
-{
- T* out = output.get();
-
- const dim4 dims = output.dims();
- const dim4 strides = output.strides();
-
- for(dim_t w = 0; w < dims[3]; w++) {
- dim_t offW = w * strides[3];
- for(dim_t z = 0; z < dims[2]; z++) {
- dim_t offWZ = offW + z * strides[2];
- for(dim_t y = 0; y < dims[1]; y++) {
- dim_t offWZY = offWZ + y * strides[1];
- for(dim_t x = 0; x < dims[0]; x++) {
- dim_t id = offWZY + x;
- if(dim == 0) {
- out[id] = x;
- } else if(dim == 1) {
- out[id] = y;
- } else if(dim == 2) {
- out[id] = z;
- } else if(dim == 3) {
- out[id] = w;
- }
- }
- }
- }
- }
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
template<typename T>
Array<T> range(const dim4& dims, const int seq_dim)
{
@@ -69,10 +33,10 @@ Array<T> range(const dim4& dims, const int seq_dim)
Array<T> out = createEmptyArray<T>(dims);
switch(_seq_dim) {
- case 0: getQueue().enqueue(range<T, 0>, out); break;
- case 1: getQueue().enqueue(range<T, 1>, out); break;
- case 2: getQueue().enqueue(range<T, 2>, out); break;
- case 3: getQueue().enqueue(range<T, 3>, out); break;
+ case 0: getQueue().enqueue(kernel::range<T, 0>, out); break;
+ case 1: getQueue().enqueue(kernel::range<T, 1>, out); break;
+ case 2: getQueue().enqueue(kernel::range<T, 2>, out); break;
+ case 3: getQueue().enqueue(kernel::range<T, 3>, out); break;
default : AF_ERROR("Invalid rep selection", AF_ERR_ARG);
}
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index cce1226..cd44b5e 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -15,9 +15,9 @@
#include <ops.hpp>
#include <functional>
#include <complex>
-
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/reduce.hpp>
using af::dim4;
@@ -38,56 +38,6 @@ struct Binary<cdouble, af_add_t>
namespace cpu
{
-template<af_op_t op, typename Ti, typename To, int D>
-struct reduce_dim
-{
- void operator()(Array<To> out, const dim_t outOffset,
- const Array<Ti> in, const dim_t inOffset,
- const int dim, bool change_nan, double nanval)
- {
- static const int D1 = D - 1;
- static reduce_dim<op, Ti, To, D1> reduce_dim_next;
-
- const dim4 ostrides = out.strides();
- const dim4 istrides = in.strides();
- const dim4 odims = out.dims();
-
- for (dim_t i = 0; i < odims[D1]; i++) {
- reduce_dim_next(out, outOffset + i * ostrides[D1],
- in, inOffset + i * istrides[D1],
- dim, change_nan, nanval);
- }
- }
-};
-
-template<af_op_t op, typename Ti, typename To>
-struct reduce_dim<op, Ti, To, 0>
-{
-
- Transform<Ti, To, op> transform;
- Binary<To, op> reduce;
- void operator()(Array<To> out, const dim_t outOffset,
- const Array<Ti> in, const dim_t inOffset,
- const int dim, bool change_nan, double nanval)
- {
- const dim4 istrides = in.strides();
- const dim4 idims = in.dims();
-
- To * const outPtr = out.get() + outOffset;
- Ti const * const inPtr = in.get() + inOffset;
- dim_t stride = istrides[dim];
-
- To out_val = reduce.init();
- for (dim_t i = 0; i < idims[dim]; i++) {
- To in_val = transform(inPtr[i * stride]);
- if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
- out_val = reduce(in_val, out_val);
- }
-
- *outPtr = out_val;
- }
-};
-
template<af_op_t op, typename Ti, typename To>
using reduce_dim_func = std::function<void(Array<To>, const dim_t,
const Array<Ti>, const dim_t,
@@ -101,10 +51,10 @@ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nan
in.eval();
Array<To> out = createEmptyArray<To>(odims);
- static const reduce_dim_func<op, Ti, To> reduce_funcs[4] = { reduce_dim<op, Ti, To, 1>()
- , reduce_dim<op, Ti, To, 2>()
- , reduce_dim<op, Ti, To, 3>()
- , reduce_dim<op, Ti, To, 4>()};
+ static const reduce_dim_func<op, Ti, To> reduce_funcs[4] = { kernel::reduce_dim<op, Ti, To, 1>()
+ , kernel::reduce_dim<op, Ti, To, 2>()
+ , kernel::reduce_dim<op, Ti, To, 3>()
+ , kernel::reduce_dim<op, Ti, To, 4>()};
getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval);
diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp
index f7309c8..ffac11c 100644
--- a/src/backend/cpu/regions.cpp
+++ b/src/backend/cpu/regions.cpp
@@ -19,6 +19,7 @@
#include <algorithm>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/regions.hpp>
using af::dim4;
@@ -26,186 +27,14 @@ namespace cpu
{
template<typename T>
-class LabelNode
-{
-private:
- T label;
- T minLabel;
- unsigned rank;
- LabelNode* parent;
-
-public:
- LabelNode() : label(0), minLabel(0), rank(0), parent(this) { }
- LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { }
-
- T getLabel()
- {
- return label;
- }
-
- T getMinLabel()
- {
- return minLabel;
- }
-
- LabelNode* getParent()
- {
- return parent;
- }
-
- unsigned getRank()
- {
- return rank;
- }
-
- void setMinLabel(T l)
- {
- minLabel = l;
- }
-
- void setParent(LabelNode* p)
- {
- parent = p;
- }
-
- void setRank(unsigned r)
- {
- rank = r;
- }
-};
-
-template<typename T>
-static LabelNode<T>* find(LabelNode<T>* x)
-{
- if (x->getParent() != x)
- x->setParent(find(x->getParent()));
- return x->getParent();
-}
-
-template<typename T>
-static void setUnion(LabelNode<T>* x, LabelNode<T>* y)
-{
- LabelNode<T>* xRoot = find(x);
- LabelNode<T>* yRoot = find(y);
- if (xRoot == yRoot)
- return;
-
- T xMinLabel = xRoot->getMinLabel();
- T yMinLabel = yRoot->getMinLabel();
- xRoot->setMinLabel(min(xMinLabel, yMinLabel));
- yRoot->setMinLabel(min(xMinLabel, yMinLabel));
-
- if (xRoot->getRank() < yRoot->getRank())
- xRoot->setParent(yRoot);
- else if (xRoot->getRank() > yRoot->getRank())
- yRoot->setParent(xRoot);
- else {
- yRoot->setParent(xRoot);
- xRoot->setRank(xRoot->getRank() + 1);
- }
-}
-
-template<typename T>
Array<T> regions(const Array<char> &in, af_connectivity connectivity)
{
in.eval();
- // Create output placeholder
Array<T> out = createValueArray(in.dims(), (T)0);
out.eval();
- auto func = [=] (Array<T> out, const Array<char> in, af_connectivity connectivity) {
- const dim4 in_dims = in.dims();
- const char *in_ptr = in.get();
- T *out_ptr = out.get();
-
- // Map labels
- typedef typename std::map<T, LabelNode<T>* > label_map_t;
- typedef typename label_map_t::iterator label_map_iterator_t;
-
- label_map_t lmap;
-
- // Initial label
- T label = (T)1;
-
- for (int j = 0; j < (int)in_dims[1]; j++) {
- for (int i = 0; i < (int)in_dims[0]; i++) {
- int idx = j * in_dims[0] + i;
- if (in_ptr[idx] != 0) {
- std::vector<T> l;
-
- // Test neighbors
- if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0)
- l.push_back(out_ptr[j * in_dims[0] + i-1]);
- if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0)
- l.push_back(out_ptr[(j-1) * in_dims[0] + i]);
- if (connectivity == AF_CONNECTIVITY_8 && i > 0 &&
- j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0)
- l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]);
- if (connectivity == AF_CONNECTIVITY_8 &&
- i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0)
- l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]);
-
- if (!l.empty()) {
- T minl = l[0];
- for (size_t k = 0; k < l.size(); k++) {
- minl = min(l[k], minl);
- label_map_iterator_t cur_map = lmap.find(l[k]);
- LabelNode<T> *node = cur_map->second;
- // Group labels of the same region under a disjoint set
- for (size_t m = k+1; m < l.size(); m++)
- setUnion(node, lmap.find(l[m])->second);
- }
- // Set label to smallest neighbor label
- out_ptr[idx] = minl;
- }
- else {
- // Insert new label in map
- LabelNode<T> *node = new LabelNode<T>(label);
- lmap.insert(std::pair<T, LabelNode<T>* >(label, node));
- out_ptr[idx] = label++;
- }
- }
- }
- }
-
- std::set<T> removed;
-
- for (int j = 0; j < (int)in_dims[1]; j++) {
- for (int i = 0; i < (int)in_dims[0]; i++) {
- int idx = j * (int)in_dims[0] + i;
- if (in_ptr[idx] != 0) {
- T l = out_ptr[idx];
- label_map_iterator_t cur_map = lmap.find(l);
-
- if (cur_map != lmap.end()) {
- LabelNode<T>* node = cur_map->second;
-
- LabelNode<T>* node_root = find(node);
- out_ptr[idx] = node_root->getMinLabel();
-
- // Mark removed labels (those that are part of a region
- // that contains a smaller label)
- if (node->getMinLabel() < l || node_root->getMinLabel() < l)
- removed.insert(l);
- if (node->getLabel() > node->getMinLabel())
- removed.insert(node->getLabel());
- }
- }
- }
- }
-
- // Calculate final neighbors (ensure final labels are sequential)
- for (int j = 0; j < (int)in_dims[1]; j++) {
- for (int i = 0; i < (int)in_dims[0]; i++) {
- int idx = j * (int)in_dims[0] + i;
- if (out_ptr[idx] > 0) {
- out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx]));
- }
- }
- }
- };
- getQueue().enqueue(func, out, in, connectivity);
+ getQueue().enqueue(kernel::regions<T>, out, in, connectivity);
return out;
}
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index 1ad7dad..162039b 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -9,49 +9,14 @@
#include <Array.hpp>
#include <reorder.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/reorder.hpp>
namespace cpu
{
template<typename T>
-void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
-{
- T* outPtr = out.get();
- const T* inPtr = in.get();
-
- const af::dim4 ist = in.strides();
- const af::dim4 ost = out.strides();
-
-
- dim_t ids[4] = {0};
- for(dim_t ow = 0; ow < oDims[3]; ow++) {
- const dim_t oW = ow * ost[3];
- ids[rdims[3]] = ow;
- for(dim_t oz = 0; oz < oDims[2]; oz++) {
- const dim_t oZW = oW + oz * ost[2];
- ids[rdims[2]] = oz;
- for(dim_t oy = 0; oy < oDims[1]; oy++) {
- const dim_t oYZW = oZW + oy * ost[1];
- ids[rdims[1]] = oy;
- for(dim_t ox = 0; ox < oDims[0]; ox++) {
- const dim_t oIdx = oYZW + ox;
-
- ids[rdims[0]] = ox;
- const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
- ids[1] * ist[1] + ids[0];
-
- outPtr[oIdx] = inPtr[iIdx];
- }
- }
- }
- }
-}
-
-template<typename T>
Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
{
in.eval();
@@ -62,7 +27,7 @@ Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
oDims[i] = iDims[rdims[i]];
Array<T> out = createEmptyArray<T>(oDims);
- getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
+ getQueue().enqueue(kernel::reorder<T>, out, in, oDims, rdims);
return out;
}
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp
index 8fb2edc..9a5c85b 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/resize.cpp
@@ -9,174 +9,16 @@
#include <Array.hpp>
#include <resize.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <math.hpp>
#include <types.hpp>
#include <af/traits.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/resize.hpp>
namespace cpu
{
-/**
- * noop function for round to avoid compilation
- * issues due to lack of this function in C90 based
- * compilers, it is only present in C99 and C++11
- *
- * This is not a full fledged implementation, this function
- * is to be used only for positive numbers, i m using it here
- * for calculating dimensions of arrays
- */
-dim_t round2int(float value)
-{
- return (dim_t)(value+0.5f);
-}
-
-using std::conditional;
-using std::is_same;
-
-template<typename T>
-using wtype_t = typename conditional<is_same<T, double>::value, double, float>::type;
-
-template<typename T>
-using vtype_t = typename conditional<is_complex<T>::value,
- T, wtype_t<T>
- >::type;
-
-template<typename T, af_interp_type method>
-struct resize_op
-{
- void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const dim_t x, const dim_t y)
- {
- return;
- }
-};
-
-template<typename T>
-struct resize_op<T, AF_INTERP_NEAREST>
-{
- void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const dim_t x, const dim_t y)
- {
- // Compute Indices
- dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0]));
- dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1]));
-
- if (i_x >= idims[0]) i_x = idims[0] - 1;
- if (i_y >= idims[1]) i_y = idims[1] - 1;
-
- dim_t i_off = i_y * istrides[1] + i_x;
- dim_t o_off = y * ostrides[1] + x;
- // Copy values from all channels
- for(dim_t w = 0; w < odims[3]; w++) {
- dim_t wost = w * ostrides[3];
- dim_t wist = w * istrides[3];
- for(dim_t z = 0; z < odims[2]; z++) {
- outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist];
- }
- }
- }
-};
-
-template<typename T>
-struct resize_op<T, AF_INTERP_BILINEAR>
-{
- void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const dim_t x, const dim_t y)
- {
- // Compute Indices
- float f_x = (float)x / (odims[0] / (float)idims[0]);
- float f_y = (float)y / (odims[1] / (float)idims[1]);
-
- dim_t i1_x = floor(f_x);
- dim_t i1_y = floor(f_y);
-
- if (i1_x >= idims[0]) i1_x = idims[0] - 1;
- if (i1_y >= idims[1]) i1_y = idims[1] - 1;
-
- float b = f_x - i1_x;
- float a = f_y - i1_y;
-
- dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1);
- dim_t i2_y = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1);
-
- typedef typename dtype_traits<T>::base_type BT;
- typedef wtype_t<BT> WT;
- typedef vtype_t<T> VT;
-
- dim_t o_off = y * ostrides[1] + x;
- // Copy values from all channels
- for(dim_t w = 0; w < odims[3]; w++) {
- dim_t wst = w * istrides[3];
- for(dim_t z = 0; z < odims[2]; z++) {
- dim_t zst = z * istrides[2];
- dim_t channel_off = zst + wst;
- VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off];
- VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off];
- VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off];
- VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off];
-
- outPtr[o_off + z * ostrides[2] + w * ostrides[3]] =
- scalar<WT>((1.0f - a) * (1.0f - b)) * p1 +
- scalar<WT>(( a ) * (1.0f - b)) * p2 +
- scalar<WT>((1.0f - a) * ( b )) * p3 +
- scalar<WT>(( a ) * ( b )) * p4;
- }
- }
- }
-};
-
-template<typename T>
-struct resize_op<T, AF_INTERP_LOWER>
-{
- void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
- const af::dim4 &ostrides, const af::dim4 &istrides,
- const dim_t x, const dim_t y)
- {
- // Compute Indices
- dim_t i_x = floor((float)x / (odims[0] / (float)idims[0]));
- dim_t i_y = floor((float)y / (odims[1] / (float)idims[1]));
-
- if (i_x >= idims[0]) i_x = idims[0] - 1;
- if (i_y >= idims[1]) i_y = idims[1] - 1;
-
- dim_t i_off = i_y * istrides[1] + i_x;
- dim_t o_off = y * ostrides[1] + x;
- // Copy values from all channels
- for(dim_t w = 0; w < odims[3]; w++) {
- dim_t wost = w * ostrides[3];
- dim_t wist = w * istrides[3];
- for(dim_t z = 0; z < odims[2]; z++) {
- outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist];
- }
- }
- }
-};
-
-template<typename T, af_interp_type method>
-void resize_(Array<T> out, const Array<T> in)
-{
- af::dim4 idims = in.dims();
- af::dim4 odims = out.dims();
- const T *inPtr = in.get();
- T *outPtr = out.get();
- af::dim4 ostrides = out.strides();
- af::dim4 istrides = in.strides();
-
- resize_op<T, method> op;
- for(dim_t y = 0; y < odims[1]; y++) {
- for(dim_t x = 0; x < odims[0]; x++) {
- op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y);
- }
- }
-}
-
template<typename T>
Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
const af_interp_type method)
@@ -190,11 +32,11 @@ Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
switch(method) {
case AF_INTERP_NEAREST:
- getQueue().enqueue(resize_<T, AF_INTERP_NEAREST>, out, in); break;
+ getQueue().enqueue(kernel::resize<T, AF_INTERP_NEAREST>, out, in); break;
case AF_INTERP_BILINEAR:
- getQueue().enqueue(resize_<T, AF_INTERP_BILINEAR>, out, in); break;
+ getQueue().enqueue(kernel::resize<T, AF_INTERP_BILINEAR>, out, in); break;
case AF_INTERP_LOWER:
- getQueue().enqueue(resize_<T, AF_INTERP_LOWER>, out, in); break;
+ getQueue().enqueue(kernel::resize<T, AF_INTERP_LOWER>, out, in); break;
default: break;
}
return out;
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp
index 5687d69..e81ee04 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/rotate.cpp
@@ -9,77 +9,14 @@
#include <Array.hpp>
#include <rotate.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
#include "transform_interp.hpp"
+#include <kernel/rotate.hpp>
namespace cpu
{
-template<typename T, af_interp_type method>
-void rotate_(Array<T> output, const Array<T> input, const float theta)
-{
- const af::dim4 odims = output.dims();
- const af::dim4 idims = input.dims();
- const af::dim4 ostrides = output.strides();
- const af::dim4 istrides = input.strides();
-
- const T* in = input.get();
- T* out = output.get();
- dim_t nimages = idims[2];
-
- void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
- const af::dim4 &, const af::dim4 &,
- const dim_t, const dim_t, const dim_t, const dim_t);
-
- const float c = cos(-theta), s = sin(-theta);
- float tx, ty;
- {
- const float nx = 0.5 * (idims[0] - 1);
- const float ny = 0.5 * (idims[1] - 1);
- const float mx = 0.5 * (odims[0] - 1);
- const float my = 0.5 * (odims[1] - 1);
- const float sx = (mx * c + my *-s);
- const float sy = (mx * s + my * c);
- tx = -(sx - nx);
- ty = -(sy - ny);
- }
-
- const float tmat[6] = {std::round( c * 1000) / 1000.0f,
- std::round(-s * 1000) / 1000.0f,
- std::round(tx * 1000) / 1000.0f,
- std::round( s * 1000) / 1000.0f,
- std::round( c * 1000) / 1000.0f,
- std::round(ty * 1000) / 1000.0f,
- };
-
- switch(method) {
- case AF_INTERP_NEAREST:
- t_fn = &transform_n;
- break;
- case AF_INTERP_BILINEAR:
- t_fn = &transform_b;
- break;
- case AF_INTERP_LOWER:
- t_fn = &transform_l;
- break;
- default:
- AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
- break;
- }
-
-
- // Do transform for image
- for(int yy = 0; yy < (int)odims[1]; yy++) {
- for(int xx = 0; xx < (int)odims[0]; xx++) {
- t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
- }
- }
-}
-
template<typename T>
Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
const af_interp_type method)
@@ -90,13 +27,13 @@ Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
switch(method) {
case AF_INTERP_NEAREST:
- getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
+ getQueue().enqueue(kernel::rotate<T, AF_INTERP_NEAREST>, out, in, theta);
break;
case AF_INTERP_BILINEAR:
- getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
+ getQueue().enqueue(kernel::rotate<T, AF_INTERP_BILINEAR>, out, in, theta);
break;
case AF_INTERP_LOWER:
- getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
+ getQueue().enqueue(kernel::rotate<T, AF_INTERP_LOWER>, out, in, theta);
break;
default:
AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp
index 39157ca..615744f 100644
--- a/src/backend/cpu/scan.cpp
+++ b/src/backend/cpu/scan.cpp
@@ -16,64 +16,13 @@
#include <ops.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/scan.hpp>
using af::dim4;
namespace cpu
{
-template<af_op_t op, typename Ti, typename To, int D>
-struct scan_dim
-{
- void operator()(Array<To> out, dim_t outOffset,
- const Array<Ti> in, dim_t inOffset,
- const int dim) const
- {
- const dim4 odims = out.dims();
- const dim4 ostrides = out.strides();
- const dim4 istrides = in.strides();
-
- const int D1 = D - 1;
- for (dim_t i = 0; i < odims[D1]; i++) {
- scan_dim<op, Ti, To, D1> func;
- getQueue().enqueue(func,
- out, outOffset + i * ostrides[D1],
- in, inOffset + i * istrides[D1], dim);
- if (D1 == dim) break;
- }
- }
-};
-
-template<af_op_t op, typename Ti, typename To>
-struct scan_dim<op, Ti, To, 0>
-{
- void operator()(Array<To> output, dim_t outOffset,
- const Array<Ti> input, dim_t inOffset,
- const int dim) const
- {
- const Ti* in = input.get() + inOffset;
- To* out= output.get()+ outOffset;
-
- const dim4 ostrides = output.strides();
- const dim4 istrides = input.strides();
- const dim4 idims = input.dims();
-
- dim_t istride = istrides[dim];
- dim_t ostride = ostrides[dim];
-
- Transform<Ti, To, op> transform;
- // FIXME: Change the name to something better
- Binary<To, op> scan;
-
- To out_val = scan.init();
- for (dim_t i = 0; i < idims[dim]; i++) {
- To in_val = transform(in[i * istride]);
- out_val = scan(in_val, out_val);
- out[i * ostride] = out_val;
- }
- }
-};
-
template<af_op_t op, typename Ti, typename To>
Array<To> scan(const Array<Ti>& in, const int dim)
{
@@ -84,19 +33,19 @@ Array<To> scan(const Array<Ti>& in, const int dim)
switch (in.ndims()) {
case 1:
- scan_dim<op, Ti, To, 1> func1;
+ kernel::scan_dim<op, Ti, To, 1> func1;
getQueue().enqueue(func1, out, 0, in, 0, dim);
break;
case 2:
- scan_dim<op, Ti, To, 2> func2;
+ kernel::scan_dim<op, Ti, To, 2> func2;
getQueue().enqueue(func2, out, 0, in, 0, dim);
break;
case 3:
- scan_dim<op, Ti, To, 3> func3;
+ kernel::scan_dim<op, Ti, To, 3> func3;
getQueue().enqueue(func3, out, 0, in, 0, dim);
break;
case 4:
- scan_dim<op, Ti, To, 4> func4;
+ kernel::scan_dim<op, Ti, To, 4> func4;
getQueue().enqueue(func4, out, 0, in, 0, dim);
break;
}
diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp
index 4a219ed..d9a6795 100644
--- a/src/backend/cpu/select.cpp
+++ b/src/backend/cpu/select.cpp
@@ -6,12 +6,13 @@
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+
#include <ArrayInfo.hpp>
#include <Array.hpp>
#include <select.hpp>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/select.hpp>
using af::dim4;
@@ -25,66 +26,7 @@ void select(Array<T> &out, const Array<char> &cond, const Array<T> &a, const Arr
cond.eval();
a.eval();
b.eval();
- auto func = [=] (Array<T> out, const Array<char> cond, const Array<T> a, const Array<T> b) {
- dim4 adims = a.dims();
- dim4 astrides = a.strides();
- dim4 bdims = b.dims();
- dim4 bstrides = b.strides();
-
- dim4 cdims = cond.dims();
- dim4 cstrides = cond.strides();
-
- dim4 odims = out.dims();
- dim4 ostrides = out.strides();
-
- bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1],
- adims[2] == odims[2], adims[3] == odims[3]};
-
- bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1],
- bdims[2] == odims[2], bdims[3] == odims[3]};
-
- bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1],
- cdims[2] == odims[2], cdims[3] == odims[3]};
-
- const T *aptr = a.get();
- const T *bptr = b.get();
- T *optr = out.get();
- const char *cptr = cond.get();
-
- for (int l = 0; l < odims[3]; l++) {
-
- int o_off3 = ostrides[3] * l;
- int a_off3 = astrides[3] * is_a_same[3] * l;
- int b_off3 = bstrides[3] * is_b_same[3] * l;
- int c_off3 = cstrides[3] * is_c_same[3] * l;
-
- for (int k = 0; k < odims[2]; k++) {
-
- int o_off2 = ostrides[2] * k + o_off3;
- int a_off2 = astrides[2] * is_a_same[2] * k + a_off3;
- int b_off2 = bstrides[2] * is_b_same[2] * k + b_off3;
- int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3;
-
- for (int j = 0; j < odims[1]; j++) {
-
- int o_off1 = ostrides[1] * j + o_off2;
- int a_off1 = astrides[1] * is_a_same[1] * j + a_off2;
- int b_off1 = bstrides[1] * is_b_same[1] * j + b_off2;
- int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2;
-
- for (int i = 0; i < odims[0]; i++) {
-
- bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1];
- T aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1];
- T bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1];
- T oval = cval ? aval : bval;
- optr[o_off1 + i] = oval;
- }
- }
- }
- }
- };
- getQueue().enqueue(func, out, cond, a, b);
+ getQueue().enqueue(kernel::select<T>, out, cond, a, b);
}
template<typename T, bool flip>
@@ -93,44 +35,7 @@ void select_scalar(Array<T> &out, const Array<char> &cond, const Array<T> &a, co
out.eval();
cond.eval();
a.eval();
- auto func = [=] (Array<T> out, const Array<char> cond, const Array<T> a, const double b) {
- dim4 astrides = a.strides();
- dim4 cstrides = cond.strides();
-
- dim4 odims = out.dims();
- dim4 ostrides = out.strides();
-
- const T *aptr = a.get();
- T *optr = out.get();
- const char *cptr = cond.get();
-
- for (int l = 0; l < odims[3]; l++) {
-
- int o_off3 = ostrides[3] * l;
- int a_off3 = astrides[3] * l;
- int c_off3 = cstrides[3] * l;
-
- for (int k = 0; k < odims[2]; k++) {
-
- int o_off2 = ostrides[2] * k + o_off3;
- int a_off2 = astrides[2] * k + a_off3;
- int c_off2 = cstrides[2] * k + c_off3;
-
- for (int j = 0; j < odims[1]; j++) {
-
- int o_off1 = ostrides[1] * j + o_off2;
- int a_off1 = astrides[1] * j + a_off2;
- int c_off1 = cstrides[1] * j + c_off2;
-
- for (int i = 0; i < odims[0]; i++) {
-
- optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b;
- }
- }
- }
- }
- };
- getQueue().enqueue(func, out, cond, a, b);
+ getQueue().enqueue(kernel::select_scalar<T, flip>, out, cond, a, b);
}
#define INSTANTIATE(T) \
diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp
index 766427b..eca1e50 100644
--- a/src/backend/cpu/shift.cpp
+++ b/src/backend/cpu/shift.cpp
@@ -9,20 +9,13 @@
#include <Array.hpp>
#include <shift.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <cassert>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/shift.hpp>
namespace cpu
{
-static inline dim_t simple_mod(const dim_t i, const dim_t dim)
-{
- return (i < dim) ? i : (i - dim);
-}
-
template<typename T>
Array<T> shift(const Array<T> &in, const int sdims[4])
{
@@ -31,48 +24,7 @@ Array<T> shift(const Array<T> &in, const int sdims[4])
Array<T> out = createEmptyArray<T>(in.dims());
const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]);
- auto func = [=] (Array<T> out, const Array<T> in, const af::dim4 sdims) {
-
- T* outPtr = out.get();
- const T* inPtr = in.get();
-
- const af::dim4 oDims = out.dims();
- const af::dim4 ist = in.strides();
- const af::dim4 ost = out.strides();
-
- int sdims_[4];
- // Need to do this because we are mapping output to input in the kernel
- for(int i = 0; i < 4; i++) {
- // sdims_[i] will always be positive and always [0, oDims[i]].
- // Negative shifts are converted to position by going the other way round
- sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0);
- assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]);
- }
-
- for(dim_t ow = 0; ow < oDims[3]; ow++) {
- const int oW = ow * ost[3];
- const int iw = simple_mod((ow + sdims_[3]), oDims[3]);
- const int iW = iw * ist[3];
- for(dim_t oz = 0; oz < oDims[2]; oz++) {
- const int oZW = oW + oz * ost[2];
- const int iz = simple_mod((oz + sdims_[2]), oDims[2]);
- const int iZW = iW + iz * ist[2];
- for(dim_t oy = 0; oy < oDims[1]; oy++) {
- const int oYZW = oZW + oy * ost[1];
- const int iy = simple_mod((oy + sdims_[1]), oDims[1]);
- const int iYZW = iZW + iy * ist[1];
- for(dim_t ox = 0; ox < oDims[0]; ox++) {
- const int oIdx = oYZW + ox;
- const int ix = simple_mod((ox + sdims_[0]), oDims[0]);
- const int iIdx = iYZW + ix;
-
- outPtr[oIdx] = inPtr[iIdx];
- }
- }
- }
- }
- };
- getQueue().enqueue(func, out, in, temp);
+ getQueue().enqueue(kernel::shift<T>, out, in, temp);
return out;
}
diff --git a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp
index 70bb11d..4b20f8a 100644
--- a/src/backend/cpu/sift.cpp
+++ b/src/backend/cpu/sift.cpp
@@ -22,7 +22,7 @@
#include <vector>
#ifdef AF_BUILD_SIFT
-#include <sift_nonfree.hpp>
+#include <kernel/sift_nonfree.hpp>
#endif
using af::dim4;
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp
index ba47ba9..161266d 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/sobel.cpp
@@ -13,80 +13,15 @@
#include <Array.hpp>
#include <sobel.hpp>
#include <convolve.hpp>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/sobel.hpp>
using af::dim4;
namespace cpu
{
-template<typename Ti, typename To, bool isDX>
-void derivative(Array<To> output, const Array<Ti> input)
-{
- const dim4 dims = input.dims();
- const dim4 strides = input.strides();
- To* optr = output.get();
- const Ti* iptr = input.get();
-
- for(dim_t b3=0; b3<dims[3]; ++b3) {
- for(dim_t b2=0; b2<dims[2]; ++b2) {
-
- for(dim_t j=0; j<dims[1]; ++j) {
-
- int joff = j;
- int _joff = j-1;
- int joff_ = j+1;
- int joffset = j*strides[1];
-
- for(dim_t i=0; i<dims[0]; ++i) {
-
- To accum = To(0);
-
- int ioff = i;
- int _ioff = i-1;
- int ioff_ = i+1;
-
- To NW = (_ioff>=0 && _joff>=0) ?
- iptr[_joff*strides[1]+_ioff*strides[0]] : 0;
- To SW = (ioff_<(int)dims[0] && _joff>=0) ?
- iptr[_joff*strides[1]+ioff_*strides[0]] : 0;
- To NE = (_ioff>=0 && joff_<(int)dims[1]) ?
- iptr[joff_*strides[1]+_ioff*strides[0]] : 0;
- To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ?
- iptr[joff_*strides[1]+ioff_*strides[0]] : 0;
-
- if (isDX) {
- To W = _joff>=0 ?
- iptr[_joff*strides[1]+ioff*strides[0]] : 0;
-
- To E = joff_<(int)dims[1] ?
- iptr[joff_*strides[1]+ioff*strides[0]] : 0;
-
- accum = NW+SW - (NE+SE) + 2*(W-E);
- } else {
- To N = _ioff>=0 ?
- iptr[joff*strides[1]+_ioff*strides[0]] : 0;
-
- To S = ioff_<(int)dims[0] ?
- iptr[joff*strides[1]+ioff_*strides[0]] : 0;
-
- accum = NW+NE - (SW+SE) + 2*(N-S);
- }
-
- optr[joffset+i*strides[0]] = accum;
- }
- }
-
- optr += strides[2];
- iptr += strides[2];
- }
- optr += strides[3];
- iptr += strides[3];
- }
-}
-
template<typename Ti, typename To>
std::pair< Array<To>, Array<To> >
sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
@@ -97,8 +32,8 @@ sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
Array<To> dx = createEmptyArray<To>(img.dims());
Array<To> dy = createEmptyArray<To>(img.dims());
- getQueue().enqueue(derivative<Ti, To, true >, dx, img);
- getQueue().enqueue(derivative<Ti, To, false>, dy, img);
+ getQueue().enqueue(kernel::derivative<Ti, To, true >, dx, img);
+ getQueue().enqueue(kernel::derivative<Ti, To, false>, dy, img);
return std::make_pair(dx, dy);
}
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index cbdb50e..6a0465c 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -11,55 +11,15 @@
#include <sort.hpp>
#include <math.hpp>
#include <copy.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <algorithm>
#include <functional>
#include <platform.hpp>
#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
+#include <kernel/sort.hpp>
namespace cpu
{
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-
-// Based off of http://stackoverflow.com/a/12399290
-template<typename T, bool isAscending>
-void sort0(Array<T> val)
-{
- // initialize original index locations
- T *val_ptr = val.get();
-
- function<bool(T, T)> op = greater<T>();
- if(isAscending) { op = less<T>(); }
-
- T *comp_ptr = nullptr;
- for(dim_t w = 0; w < val.dims()[3]; w++) {
- dim_t valW = w * val.strides()[3];
- for(dim_t z = 0; z < val.dims()[2]; z++) {
- dim_t valWZ = valW + z * val.strides()[2];
- for(dim_t y = 0; y < val.dims()[1]; y++) {
-
- dim_t valOffset = valWZ + y * val.strides()[1];
-
- comp_ptr = val_ptr + valOffset;
- std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
- }
- }
- }
- return;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
template<typename T, bool isAscending>
Array<T> sort(const Array<T> &in, const unsigned dim)
{
@@ -67,7 +27,7 @@ Array<T> sort(const Array<T> &in, const unsigned dim)
Array<T> out = copyArray<T>(in);
switch(dim) {
- case 0: getQueue().enqueue(sort0<T, isAscending>, out); break;
+ case 0: getQueue().enqueue(kernel::sort0<T, isAscending>, out); break;
default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
}
return out;
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index d2ebd42..409b825 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -9,92 +9,13 @@
#include <Array.hpp>
#include <sort_by_key.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <algorithm>
-#include <numeric>
-#include <queue>
#include <platform.hpp>
#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
-using std::queue;
-using std::async;
+#include <kernel/sort_by_key.hpp>
namespace cpu
{
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-
-template<typename Tk, typename Tv, bool isAscending>
-void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
- const Array<Tk> ikey, const Array<Tv> ival)
-{
- function<bool(Tk, Tk)> op = greater<Tk>();
- if(isAscending) { op = less<Tk>(); }
-
- // Get pointers and initialize original index locations
- uint *oidx_ptr = oidx.get();
- Tk *okey_ptr = okey.get();
- Tv *oval_ptr = oval.get();
- const Tk *ikey_ptr = ikey.get();
- const Tv *ival_ptr = ival.get();
-
- std::vector<uint> seq_vec(oidx.dims()[0]);
- std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
- const Tk *comp_ptr = nullptr;
- auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
- for(dim_t w = 0; w < ikey.dims()[3]; w++) {
- dim_t okeyW = w * okey.strides()[3];
- dim_t ovalW = w * oval.strides()[3];
- dim_t oidxW = w * oidx.strides()[3];
- dim_t ikeyW = w * ikey.strides()[3];
- dim_t ivalW = w * ival.strides()[3];
-
- for(dim_t z = 0; z < ikey.dims()[2]; z++) {
- dim_t okeyWZ = okeyW + z * okey.strides()[2];
- dim_t ovalWZ = ovalW + z * oval.strides()[2];
- dim_t oidxWZ = oidxW + z * oidx.strides()[2];
- dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
- dim_t ivalWZ = ivalW + z * ival.strides()[2];
-
- for(dim_t y = 0; y < ikey.dims()[1]; y++) {
-
- dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
- dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
- dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
- dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
- dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
-
- uint *ptr = oidx_ptr + oidxOffset;
- std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
- comp_ptr = ikey_ptr + ikeyOffset;
- std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
-
- for (dim_t i = 0; i < oval.dims()[0]; ++i){
- uint sortIdx = oidx_ptr[oidxOffset + i];
- okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
- oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
- }
- }
- }
- }
-
- return;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
template<typename Tk, typename Tv, bool isAscending>
void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
@@ -108,7 +29,7 @@ void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
oidx.eval();
switch(dim) {
- case 0: getQueue().enqueue(sort0_by_key<Tk, Tv, isAscending>,
+ case 0: getQueue().enqueue(kernel::sort0_by_key<Tk, Tv, isAscending>,
okey, oval, oidx, ikey, ival); break;
default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
}
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index f941534..ed6afea 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -10,72 +10,15 @@
#include <Array.hpp>
#include <sort_index.hpp>
#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <algorithm>
#include <numeric>
#include <platform.hpp>
#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
+#include <kernel/sort_index.hpp>
namespace cpu
{
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T, bool isAscending>
-void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
-{
- // initialize original index locations
- uint *idx_ptr = idx.get();
- T *val_ptr = val.get();
- const T *in_ptr = in.get();
- function<bool(T, T)> op = greater<T>();
- if(isAscending) { op = less<T>(); }
-
- std::vector<uint> seq_vec(idx.dims()[0]);
- std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
- const T *comp_ptr = nullptr;
- auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
- for(dim_t w = 0; w < in.dims()[3]; w++) {
- dim_t valW = w * val.strides()[3];
- dim_t idxW = w * idx.strides()[3];
- dim_t inW = w * in.strides()[3];
- for(dim_t z = 0; z < in.dims()[2]; z++) {
- dim_t valWZ = valW + z * val.strides()[2];
- dim_t idxWZ = idxW + z * idx.strides()[2];
- dim_t inWZ = inW + z * in.strides()[2];
- for(dim_t y = 0; y < in.dims()[1]; y++) {
-
- dim_t valOffset = valWZ + y * val.strides()[1];
- dim_t idxOffset = idxWZ + y * idx.strides()[1];
- dim_t inOffset = inWZ + y * in.strides()[1];
-
- uint *ptr = idx_ptr + idxOffset;
- std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
- comp_ptr = in_ptr + inOffset;
- std::stable_sort(ptr, ptr + in.dims()[0], comparator);
-
- for (dim_t i = 0; i < val.dims()[0]; ++i){
- val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
- }
- }
- }
- }
-
- return;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
template<typename T, bool isAscending>
void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
{
@@ -84,7 +27,7 @@ void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint
val = createEmptyArray<T>(in.dims());
idx = createEmptyArray<uint>(in.dims());
switch(dim) {
- case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
+ case 0: getQueue().enqueue(kernel::sort0_index<T, isAscending>, val, idx, in); break;
default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
}
}
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index c278908..6e8d0fe 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -14,6 +14,7 @@
#include <memory>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/susan.hpp>
using af::features;
using std::shared_ptr;
@@ -22,85 +23,6 @@ namespace cpu
{
template<typename T>
-void susan_responses(Array<T> output, const Array<T> input,
- const unsigned idim0, const unsigned idim1,
- const int radius, const float t, const float g,
- const unsigned border_len)
-{
- T* resp_out = output.get();
- const T* in = input.get();
-
- const unsigned r = border_len;
- const int rSqrd = radius*radius;
-
- for (unsigned y = r; y < idim1 - r; ++y) {
- for (unsigned x = r; x < idim0 - r; ++x) {
- const unsigned idx = y * idim0 + x;
- T m_0 = in[idx];
- float nM = 0.0f;
-
- for (int i=-radius; i<=radius; ++i) {
- for (int j=-radius; j<=radius; ++j) {
- if (i*i + j*j < rSqrd) {
- int p = x + i;
- int q = y + j;
- T m = in[p + idim0 * q];
- float exp_pow = std::pow((m - m_0)/t, 6.0);
- float cM = std::exp(-exp_pow);
- nM += cM;
- }
- }
- }
-
- resp_out[idx] = nM < g ? g - nM : T(0);
- }
- }
-}
-
-template<typename T>
-void non_maximal(Array<float> xcoords, Array<float> ycoords, Array<float> response,
- shared_ptr<unsigned> counter, const unsigned idim0, const unsigned idim1,
- const Array<T> input, const unsigned border_len, const unsigned max_corners)
-{
- float* x_out = xcoords.get();
- float* y_out = ycoords.get();
- float* resp_out = response.get();
- unsigned* count = counter.get();
- const T* resp_in= input.get();
-
- // Responses on the border don't have 8-neighbors to compare, discard them
- const unsigned r = border_len + 1;
-
- for (unsigned y = r; y < idim1 - r; y++) {
- for (unsigned x = r; x < idim0 - r; x++) {
- const T v = resp_in[y * idim0 + x];
-
- // Find maximum neighborhood response
- T max_v;
- max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]);
- max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]);
- max_v = max(max_v, resp_in[(y-1) * idim0 + x ]);
- max_v = max(max_v, resp_in[(y+1) * idim0 + x ]);
- max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]);
- max_v = max(max_v, resp_in[(y) * idim0 + x+1]);
- max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]);
-
- // Stores corner to {x,y,resp}_out if it's response is maximum compared
- // to its 8-neighborhood and greater or equal minimum response
- if (v > max_v) {
- const unsigned idx = *count;
- *count += 1;
- if (idx < max_corners) {
- x_out[idx] = (float)x;
- y_out[idx] = (float)y;
- resp_out[idx] = (float)v;
- }
- }
- }
- }
-}
-
-template<typename T>
unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
const Array<T> &in,
const unsigned radius, const float diff_thr, const float geom_thr,
@@ -118,9 +40,9 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
auto corners_found= std::shared_ptr<unsigned>(memAlloc<unsigned>(1), memFree<unsigned>);
corners_found.get()[0] = 0;
- getQueue().enqueue(susan_responses<T>, response, in, idims[0], idims[1],
+ getQueue().enqueue(kernel::susan_responses<T>, response, in, idims[0], idims[1],
radius, diff_thr, geom_thr, edge);
- getQueue().enqueue(non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
+ getQueue().enqueue(kernel::non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
idims[0], idims[1], response, edge, corner_lim);
getQueue().sync();
diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp
index 4f03545..6526917 100644
--- a/src/backend/cpu/tile.cpp
+++ b/src/backend/cpu/tile.cpp
@@ -9,10 +9,9 @@
#include <Array.hpp>
#include <tile.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/tile.hpp>
namespace cpu
{
@@ -32,40 +31,7 @@ Array<T> tile(const Array<T> &in, const af::dim4 &tileDims)
Array<T> out = createEmptyArray<T>(oDims);
- auto func = [=] (Array<T> out, const Array<T> in) {
-
- T* outPtr = out.get();
- const T* inPtr = in.get();
-
- const af::dim4 iDims = in.dims();
- const af::dim4 oDims = out.dims();
- const af::dim4 ist = in.strides();
- const af::dim4 ost = out.strides();
-
- for(dim_t ow = 0; ow < oDims[3]; ow++) {
- const dim_t iw = ow % iDims[3];
- const dim_t iW = iw * ist[3];
- const dim_t oW = ow * ost[3];
- for(dim_t oz = 0; oz < oDims[2]; oz++) {
- const dim_t iz = oz % iDims[2];
- const dim_t iZW = iW + iz * ist[2];
- const dim_t oZW = oW + oz * ost[2];
- for(dim_t oy = 0; oy < oDims[1]; oy++) {
- const dim_t iy = oy % iDims[1];
- const dim_t iYZW = iZW + iy * ist[1];
- const dim_t oYZW = oZW + oy * ost[1];
- for(dim_t ox = 0; ox < oDims[0]; ox++) {
- const dim_t ix = ox % iDims[0];
- const dim_t iMem = iYZW + ix;
- const dim_t oMem = oYZW + ox;
- outPtr[oMem] = inPtr[iMem];
- }
- }
- }
- }
- };
-
- getQueue().enqueue(func, out, in);
+ getQueue().enqueue(kernel::tile<T>, out, in);
return out;
}
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp
index a7287ce..fc71458 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/transform.cpp
@@ -10,99 +10,14 @@
#include <Array.hpp>
#include <transform.hpp>
#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
#include "transform_interp.hpp"
+#include <kernel/transform.hpp>
namespace cpu
{
-template <typename T>
-void calc_affine_inverse(T *txo, const T *txi)
-{
- T det = txi[0]*txi[4] - txi[1]*txi[3];
-
- txo[0] = txi[4] / det;
- txo[1] = txi[3] / det;
- txo[3] = txi[1] / det;
- txo[4] = txi[0] / det;
-
- txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
- txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
-}
-
-template <typename T>
-void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse)
-{
- // The way kernel is structured, it expects an inverse
- // transform matrix by default.
- // If it is an forward transform, then we need its inverse
- if(inverse) {
- for(int i = 0; i < 6; i++)
- tmat[i] = tmat_ptr[i];
- } else {
- calc_affine_inverse(tmat, tmat_ptr);
- }
-}
-
-template<typename T, af_interp_type method>
-void transform_(Array<T> output, const Array<T> input,
- const Array<float> transform, const bool inverse)
-{
- const af::dim4 idims = input.dims();
- const af::dim4 odims = output.dims();
- const af::dim4 istrides = input.strides();
- const af::dim4 ostrides = output.strides();
-
- T * out = output.get();
- const T * in = input.get();
- const float* tf = transform.get();
-
- dim_t nimages = idims[2];
- // Multiplied in src/backend/transform.cpp
- dim_t ntransforms = odims[2] / idims[2];
-
- void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
- const af::dim4 &, const af::dim4 &,
- const dim_t, const dim_t, const dim_t, const dim_t);
-
- switch(method) {
- case AF_INTERP_NEAREST:
- t_fn = &transform_n;
- break;
- case AF_INTERP_BILINEAR:
- t_fn = &transform_b;
- break;
- case AF_INTERP_LOWER:
- t_fn = &transform_l;
- break;
- default:
- AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
- break;
- }
-
-
- // For each transform channel
- for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) {
- // Compute inverse if required
- const float *tmat_ptr = tf + t_idx * 6;
- float tmat[6];
- calc_affine_inverse(tmat, tmat_ptr, inverse);
-
- // Offset for output pointer
- dim_t o_offset = t_idx * nimages * ostrides[2];
-
- // Do transform for image
- for(int yy = 0; yy < (int)odims[1]; yy++) {
- for(int xx = 0; xx < (int)odims[0]; xx++) {
- t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy);
- }
- }
- }
-}
-
template<typename T>
Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
const af_interp_type method, const bool inverse)
@@ -114,13 +29,13 @@ Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::
switch(method) {
case AF_INTERP_NEAREST :
- getQueue().enqueue(transform_<T, AF_INTERP_NEAREST >, out, in, transform, inverse);
+ getQueue().enqueue(kernel::transform<T, AF_INTERP_NEAREST >, out, in, transform, inverse);
break;
case AF_INTERP_BILINEAR:
- getQueue().enqueue(transform_<T, AF_INTERP_BILINEAR>, out, in, transform, inverse);
+ getQueue().enqueue(kernel::transform<T, AF_INTERP_BILINEAR>, out, in, transform, inverse);
break;
case AF_INTERP_LOWER :
- getQueue().enqueue(transform_<T, AF_INTERP_LOWER >, out, in, transform, inverse);
+ getQueue().enqueue(kernel::transform<T, AF_INTERP_LOWER >, out, in, transform, inverse);
break;
default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break;
}
diff --git a/src/backend/cpu/transform_interp.hpp b/src/backend/cpu/transform_interp.hpp
index 5ad4750..d90ae38 100644
--- a/src/backend/cpu/transform_interp.hpp
+++ b/src/backend/cpu/transform_interp.hpp
@@ -7,6 +7,8 @@
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
+#pragma once
+#include <math.hpp>
#include <types.hpp>
#include <af/traits.hpp>
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index 7e7eec1..32663e1 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -14,7 +14,7 @@
#include <transpose.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
-
+#include <kernel/transpose.hpp>
#include <utility>
#include <cassert>
@@ -23,74 +23,6 @@ using af::dim4;
namespace cpu
{
-static inline unsigned getIdx(const dim4 &strides,
- int i, int j = 0, int k = 0, int l = 0)
-{
- return (l * strides[3] +
- k * strides[2] +
- j * strides[1] +
- i );
-}
-
-template<typename T>
-T getConjugate(const T &in)
-{
- // For non-complex types return same
- return in;
-}
-
-template<>
-cfloat getConjugate(const cfloat &in)
-{
- return std::conj(in);
-}
-
-template<>
-cdouble getConjugate(const cdouble &in)
-{
- return std::conj(in);
-}
-
-template<typename T, bool conjugate>
-void transpose_(Array<T> output, const Array<T> input)
-{
- const dim4 odims = output.dims();
- const dim4 ostrides = output.strides();
- const dim4 istrides = input.strides();
-
- T * out = output.get();
- T const * const in = input.get();
-
- for (dim_t l = 0; l < odims[3]; ++l) {
- for (dim_t k = 0; k < odims[2]; ++k) {
- // Outermost loop handles batch mode
- // if input has no data along third dimension
- // this loop runs only once
- for (dim_t j = 0; j < odims[1]; ++j) {
- for (dim_t i = 0; i < odims[0]; ++i) {
- // calculate array indices based on offsets and strides
- // the helper getIdx takes care of indices
- const dim_t inIdx = getIdx(istrides,j,i,k,l);
- const dim_t outIdx = getIdx(ostrides,i,j,k,l);
- if(conjugate)
- out[outIdx] = getConjugate(in[inIdx]);
- else
- out[outIdx] = in[inIdx];
- }
- }
- // outData and inData pointers doesn't need to be
- // offset as the getIdx function is taking care
- // of the batch parameter
- }
- }
-}
-
-template<typename T>
-void transpose_(Array<T> out, const Array<T> in, const bool conjugate)
-{
- return (conjugate ? transpose_<T, true>(out, in) : transpose_<T, false>(out, in));
-}
-
template<typename T>
Array<T> transpose(const Array<T> &in, const bool conjugate)
{
@@ -101,57 +33,16 @@ Array<T> transpose(const Array<T> &in, const bool conjugate)
// create an array with first two dimensions swapped
Array<T> out = createEmptyArray<T>(outDims);
- getQueue().enqueue(transpose_<T>, out, in, conjugate);
+ getQueue().enqueue(kernel::transpose<T>, out, in, conjugate);
return out;
}
-template<typename T, bool conjugate>
-void transpose_inplace(Array<T> input)
-{
- const dim4 idims = input.dims();
- const dim4 istrides = input.strides();
-
- T * in = input.get();
-
- for (dim_t l = 0; l < idims[3]; ++l) {
- for (dim_t k = 0; k < idims[2]; ++k) {
- // Outermost loop handles batch mode
- // if input has no data along third dimension
- // this loop runs only once
- //
- // Run only bottom triangle. std::swap swaps with upper triangle
- for (dim_t j = 0; j < idims[1]; ++j) {
- for (dim_t i = j + 1; i < idims[0]; ++i) {
- // calculate array indices based on offsets and strides
- // the helper getIdx takes care of indices
- const dim_t iIdx = getIdx(istrides,j,i,k,l);
- const dim_t oIdx = getIdx(istrides,i,j,k,l);
- if(conjugate) {
- in[iIdx] = getConjugate(in[iIdx]);
- in[oIdx] = getConjugate(in[oIdx]);
- std::swap(in[iIdx], in[oIdx]);
- }
- else {
- std::swap(in[iIdx], in[oIdx]);
- }
- }
- }
- }
- }
-}
-
-template<typename T>
-void transpose_inplace_(Array<T> in, const bool conjugate)
-{
- return (conjugate ? transpose_inplace<T, true >(in) : transpose_inplace<T, false>(in));
-}
-
template<typename T>
void transpose_inplace(Array<T> &in, const bool conjugate)
{
in.eval();
- getQueue().enqueue(transpose_inplace_<T>, in, conjugate);
+ getQueue().enqueue(kernel::transpose_inplace<T>, in, conjugate);
}
#define INSTANTIATE(T) \
diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp
index 13bee16..2a9553c 100644
--- a/src/backend/cpu/triangle.cpp
+++ b/src/backend/cpu/triangle.cpp
@@ -14,6 +14,7 @@
#include <math.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/triangle.hpp>
namespace cpu
{
@@ -21,46 +22,7 @@ namespace cpu
template<typename T, bool is_upper, bool is_unit_diag>
void triangle(Array<T> &out, const Array<T> &in)
{
- auto func = [=] (Array<T> out, const Array<T> in) {
- T *o = out.get();
- const T *i = in.get();
-
- dim4 odm = out.dims();
-
- dim4 ost = out.strides();
- dim4 ist = in.strides();
-
- for(dim_t ow = 0; ow < odm[3]; ow++) {
- const dim_t oW = ow * ost[3];
- const dim_t iW = ow * ist[3];
-
- for(dim_t oz = 0; oz < odm[2]; oz++) {
- const dim_t oZW = oW + oz * ost[2];
- const dim_t iZW = iW + oz * ist[2];
-
- for(dim_t oy = 0; oy < odm[1]; oy++) {
- const dim_t oYZW = oZW + oy * ost[1];
- const dim_t iYZW = iZW + oy * ist[1];
-
- for(dim_t ox = 0; ox < odm[0]; ox++) {
- const dim_t oMem = oYZW + ox;
- const dim_t iMem = iYZW + ox;
-
- bool cond = is_upper ? (oy >= ox) : (oy <= ox);
- bool do_unit_diag = (is_unit_diag && ox == oy);
- if(cond) {
- o[oMem] = do_unit_diag ? scalar<T>(1) : i[iMem];
- } else {
- o[oMem] = scalar<T>(0);
- }
-
- }
- }
- }
- }
- };
-
- getQueue().enqueue(func, out, in);
+ getQueue().enqueue(kernel::triangle<T, is_upper, is_unit_diag>, out, in);
}
template<typename T, bool is_upper, bool is_unit_diag>
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 41423c7..1aa37a4 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -9,76 +9,15 @@
#include <Array.hpp>
#include <unwrap.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <dispatch.hpp>
#include <math.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/unwrap.hpp>
namespace cpu
{
-template<typename T, int d>
-void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
-{
- const T *inPtr = in.get();
- T *outPtr = out.get();
-
- af::dim4 idims = in.dims();
- af::dim4 odims = out.dims();
- af::dim4 istrides = in.strides();
- af::dim4 ostrides = out.strides();
-
- dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
-
- for(dim_t w = 0; w < odims[3]; w++) {
- for(dim_t z = 0; z < odims[2]; z++) {
-
- dim_t cOut = w * ostrides[3] + z * ostrides[2];
- dim_t cIn = w * istrides[3] + z * istrides[2];
- const T* iptr = inPtr + cIn;
- T* optr_= outPtr + cOut;
-
- for(dim_t col = 0; col < odims[d]; col++) {
- // Offset output ptr
- T* optr = optr_ + col * ostrides[d];
-
- // Calculate input window index
- dim_t winy = (col / nx);
- dim_t winx = (col % nx);
-
- dim_t startx = winx * sx;
- dim_t starty = winy * sy;
-
- dim_t spx = startx - px;
- dim_t spy = starty - py;
-
- // Short cut condition ensuring all values within input dimensions
- bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]);
-
- for(dim_t y = 0; y < wy; y++) {
- for(dim_t x = 0; x < wx; x++) {
- dim_t xpad = spx + x;
- dim_t ypad = spy + y;
-
- dim_t oloc = (y * wx + x);
- if (d == 0) oloc *= ostrides[1];
-
- if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
- dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
- optr[oloc] = iptr[iloc];
- } else {
- optr[oloc] = scalar<T>(0.0);
- }
- }
- }
- }
- }
- }
-}
-
template<typename T>
Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
@@ -98,9 +37,9 @@ Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
Array<T> outArray = createEmptyArray<T>(odims);
if (is_column) {
- getQueue().enqueue(unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
+ getQueue().enqueue(kernel::unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
} else {
- getQueue().enqueue(unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
+ getQueue().enqueue(kernel::unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
}
return outArray;
diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp
index 3ff54de..07487e0 100644
--- a/src/backend/cpu/wrap.cpp
+++ b/src/backend/cpu/wrap.cpp
@@ -9,75 +9,15 @@
#include <Array.hpp>
#include <wrap.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
#include <dispatch.hpp>
#include <math.hpp>
#include <platform.hpp>
#include <async_queue.hpp>
+#include <kernel/wrap.hpp>
namespace cpu
{
-template<typename T, int d>
-void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
- const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
-{
- const T *inPtr = in.get();
- T *outPtr = out.get();
-
- af::dim4 idims = in.dims();
- af::dim4 odims = out.dims();
- af::dim4 istrides = in.strides();
- af::dim4 ostrides = out.strides();
-
- dim_t nx = (odims[0] + 2 * px - wx) / sx + 1;
-
- for(dim_t w = 0; w < idims[3]; w++) {
- for(dim_t z = 0; z < idims[2]; z++) {
-
- dim_t cIn = w * istrides[3] + z * istrides[2];
- dim_t cOut = w * ostrides[3] + z * ostrides[2];
- const T* iptr_ = inPtr + cIn;
- T* optr= outPtr + cOut;
-
- for(dim_t col = 0; col < idims[d]; col++) {
- // Offset output ptr
- const T* iptr = iptr_ + col * istrides[d];
-
- // Calculate input window index
- dim_t winy = (col / nx);
- dim_t winx = (col % nx);
-
- dim_t startx = winx * sx;
- dim_t starty = winy * sy;
-
- dim_t spx = startx - px;
- dim_t spy = starty - py;
-
- // Short cut condition ensuring all values within input dimensions
- bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]);
-
- for(dim_t y = 0; y < wy; y++) {
- for(dim_t x = 0; x < wx; x++) {
- dim_t xpad = spx + x;
- dim_t ypad = spy + y;
-
- dim_t iloc = (y * wx + x);
- if (d == 0) iloc *= istrides[1];
-
- if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) {
- dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]);
- // FIXME: When using threads, atomize this
- optr[oloc] += iptr[iloc];
- }
- }
- }
- }
- }
- }
-}
-
template<typename T>
Array<T> wrap(const Array<T> &in,
const dim_t ox, const dim_t oy,
@@ -94,9 +34,9 @@ Array<T> wrap(const Array<T> &in,
in.eval();
if (is_column) {
- getQueue().enqueue(wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
+ getQueue().enqueue(kernel::wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
} else {
- getQueue().enqueue(wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
+ getQueue().enqueue(kernel::wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
}
return out;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git
More information about the debian-science-commits
mailing list