[hamradio-commits] [gnss-sdr] 227/251: Adding cuda ultra-fast correlator library. Not used yet, but optionally compiled. All CMAKEs ready!
Carles Fernandez
carles_fernandez-guest at moszumanska.debian.org
Wed Sep 2 00:22:55 UTC 2015
This is an automated email from the git hooks/post-receive script.
carles_fernandez-guest pushed a commit to branch master
in repository gnss-sdr.
commit 4fc61af172469968c4bd7821cd3633ddea077f6c
Author: Javier Arribas <javiarribas at gmail.com>
Date: Wed Jul 22 18:16:54 2015 +0200
Adding cuda ultra-fast correlator library. Not used yet, but optionally
compiled. All CMAKEs ready!
---
CMakeLists.txt | 14 +
src/algorithms/tracking/libs/CMakeLists.txt | 28 +-
.../tracking/libs/cuda_multicorrelator.cu | 418 ++++++
.../tracking/libs/cuda_multicorrelator.h | 138 ++
.../tracking/libs/cudahelpers/exception.h | 151 ++
.../tracking/libs/cudahelpers/helper_cuda.h | 1255 +++++++++++++++++
.../tracking/libs/cudahelpers/helper_cuda_drvapi.h | 517 +++++++
.../tracking/libs/cudahelpers/helper_cuda_gl.h | 165 +++
.../tracking/libs/cudahelpers/helper_functions.h | 42 +
.../tracking/libs/cudahelpers/helper_image.h | 1110 +++++++++++++++
.../tracking/libs/cudahelpers/helper_math.h | 1453 ++++++++++++++++++++
.../tracking/libs/cudahelpers/helper_string.h | 516 +++++++
.../tracking/libs/cudahelpers/helper_timer.h | 499 +++++++
13 files changed, 6304 insertions(+), 2 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6ee58a..2cc6885 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -957,6 +957,20 @@ else(ENABLE_OSMOSDR)
message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
endif(ENABLE_OSMOSDR)
+if($ENV{CUDA_GPU_ACCEL})
+ message(STATUS "CUDA_GPU_ACCEL environment variable found." )
+ set(ENABLE_CUDA ON)
+endif($ENV{CUDA_GPU_ACCEL})
+
+if(ENABLE_CUDA)
+ message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
+ message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
+else(ENABLE_CUDA)
+ message(STATUS "NVIDIA CUDA GPU Acceleration is not enabled." )
+ message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for NVIDIA CUDA GPU acceleration." )
+endif(ENABLE_CUDA)
+
+
if($ENV{FLEXIBAND_DRIVER})
message(STATUS "FLEXIBAND_DRIVER environment variable found." )
set(ENABLE_FLEXIBAND ON)
diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt
index 52470bc..aa5fb00 100644
--- a/src/algorithms/tracking/libs/CMakeLists.txt
+++ b/src/algorithms/tracking/libs/CMakeLists.txt
@@ -16,6 +16,29 @@
# along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
#
+
+if(ENABLE_CUDA)
+ FIND_PACKAGE(CUDA REQUIRED)
+
+ # Append current NVCC flags by something, e.g. compute capability
+ # set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30 --default-stream-per-thread)
+
+ list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30; -std=c++11;-O3; -use_fast_math")
+ SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+ CUDA_INCLUDE_DIRECTORIES(
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
+ )
+
+ SET(LIB_TYPE STATIC) #set the lib type
+ CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu)
+
+ set(OPT_LIBRARIES ${OPT_LIBRARIES} ${CUDA_CORRELATOR_LIB})
+
+endif(ENABLE_CUDA)
+
+
set(TRACKING_LIB_SOURCES
correlator.cc
lock_detectors.cc
@@ -24,7 +47,7 @@ set(TRACKING_LIB_SOURCES
tracking_2nd_DLL_filter.cc
tracking_2nd_PLL_filter.cc
tracking_discriminators.cc
- tracking_FLL_PLL_filter.cc
+ tracking_FLL_PLL_filter.cc
)
include_directories(
@@ -43,7 +66,8 @@ if (SSE3_AVAILABLE)
add_definitions( -DHAVE_SSE3=1 )
endif(SSE3_AVAILABLE)
+
file(GLOB TRACKING_LIB_HEADERS "*.h")
add_library(tracking_lib ${TRACKING_LIB_SOURCES} ${TRACKING_LIB_HEADERS})
source_group(Headers FILES ${TRACKING_LIB_HEADERS})
-target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES})
\ No newline at end of file
+target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES} ${OPT_LIBRARIES})
\ No newline at end of file
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
new file mode 100644
index 0000000..1909e13
--- /dev/null
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -0,0 +1,418 @@
+/*!
+ * \file cuda_multicorrelator.cu
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
+ * \authors <ul>
+ * <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ * </ul>
+ *
+ * Class that implements a highly optimized vector multiTAP correlator for NVIDIA CUDA GPUs
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
+// (the same as for floating point multiplication and addition),
+// whereas full 32-bit multiplication takes 16 clocks per warp.
+// So if integer multiplication operands are guaranteed to fit into 24 bits
+// (always lie within [-8M, 8M - 1] range in signed case),
+// explicit 24-bit multiplication is preferred for performance.
+///////////////////////////////////////////////////////////////////////////////
+#define IMUL(a, b) __mul24(a, b)
+
+#include "cuda_multicorrelator.h"
+
+#include <stdio.h>
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+// helper functions and utilities to work with CUDA
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+#define ACCUM_N 1024
+
+///////////////////////////////////////////////////////////////////////////////
+// Calculate scalar products of VectorN vectors of ElementN elements on GPU
+// Parameters restrictions:
+// 1) ElementN is strongly preferred to be a multiple of warp size to
+// meet alignment constraints of memory coalescing.
+// 2) ACCUM_N must be a power of two.
+///////////////////////////////////////////////////////////////////////////////
+
+
+__global__ void scalarProdGPUCPXxN_shifts(
+ GPU_Complex *d_corr_out,
+ GPU_Complex *d_sig_in,
+ GPU_Complex *d_local_codes_in,
+ int *d_shifts_samples,
+ int vectorN,
+ int elementN
+)
+{
+ //Accumulators cache
+ __shared__ GPU_Complex accumResult[ACCUM_N];
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Cycle through every pair of vectors,
+ // taking into account that vector counts can be different
+ // from total number of thread blocks
+ ////////////////////////////////////////////////////////////////////////////
+ for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+ {
+ int vectorBase = IMUL(elementN, vec);
+ int vectorEnd = vectorBase + elementN;
+
+ ////////////////////////////////////////////////////////////////////////
+ // Each accumulator cycles through vectors with
+ // stride equal to number of total number of accumulators ACCUM_N
+ // At this stage ACCUM_N is only preferred be a multiple of warp size
+ // to meet memory coalescing alignment constraints.
+ ////////////////////////////////////////////////////////////////////////
+ for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+ {
+ GPU_Complex sum = GPU_Complex(0,0);
+
+ for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+ {
+ //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+ //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+ sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]);
+ }
+ accumResult[iAccum] = sum;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ // Perform tree-like reduction of accumulators' results.
+ // ACCUM_N has to be power of two at this stage
+ ////////////////////////////////////////////////////////////////////////
+ for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+ {
+ __syncthreads();
+
+ for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+ {
+ accumResult[iAccum] += accumResult[stride + iAccum];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ d_corr_out[vec] = accumResult[0];
+ }
+ }
+}
+
+
+__global__ void scalarProdGPUCPXxN(
+ GPU_Complex *d_corr_out,
+ GPU_Complex *d_sig_in,
+ GPU_Complex *d_local_codes_in,
+ int vectorN,
+ int elementN
+)
+{
+ //Accumulators cache
+ __shared__ GPU_Complex accumResult[ACCUM_N];
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Cycle through every pair of vectors,
+ // taking into account that vector counts can be different
+ // from total number of thread blocks
+ ////////////////////////////////////////////////////////////////////////////
+ for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+ {
+ int vectorBase = IMUL(elementN, vec);
+ int vectorEnd = vectorBase + elementN;
+
+ ////////////////////////////////////////////////////////////////////////
+ // Each accumulator cycles through vectors with
+ // stride equal to number of total number of accumulators ACCUM_N
+ // At this stage ACCUM_N is only preferred be a multiple of warp size
+ // to meet memory coalescing alignment constraints.
+ ////////////////////////////////////////////////////////////////////////
+ for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+ {
+ GPU_Complex sum = GPU_Complex(0,0);
+
+ for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+ {
+ //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+ //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+ sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
+ }
+ accumResult[iAccum] = sum;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ // Perform tree-like reduction of accumulators' results.
+ // ACCUM_N has to be power of two at this stage
+ ////////////////////////////////////////////////////////////////////////
+ for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+ {
+ __syncthreads();
+
+ for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+ {
+ accumResult[iAccum] += accumResult[stride + iAccum];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ d_corr_out[vec] = accumResult[0];
+ }
+ }
+}
+
+
+//*********** CUDA processing **************
+// Threads: a minimal parallel execution code on GPU
+// Blocks: a set of N threads
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vectorial product of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+ if (i < numElements)
+ {
+ C[i] = A[i] * B[i];
+ }
+}
+
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
+ */
+__global__ void
+CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
+{
+ //*** NCO CPU code (GNURadio FXP NCO)
+ //float sin_f, cos_f;
+ //float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
+ //int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad);
+ //int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad);
+ //
+ //for(int i = 0; i < d_current_prn_length_samples; i++)
+ // {
+ // gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f);
+ // d_carr_sign[i] = std::complex<float>(cos_f, -sin_f);
+ // phase_rad_i += phase_step_rad_i;
+ // }
+
+ // CUDA version of floating point NCO and vector dot product integrated
+
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ float sin;
+ float cos;
+ if (i < numElements)
+ {
+ __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
+ sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin);
+ }
+}
+
+
+/**
+ * CUDA Kernel Device code
+ *
+ * NOTE(review): despite the "_add_" in the kernel name below, this body computes
+ * the element-wise product of A and B into C (identical to CUDA_32fc_x2_multiply_32fc).
+ * Confirm whether C[i] = A[i] + B[i] was intended. The 3 vectors have numElements elements.
+ */
+__global__ void
+CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+ if (i < numElements)
+ {
+ C[i] = A[i] * B[i];
+ }
+}
+
+
+bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators)
+{
+ // use command-line specified CUDA device, otherwise use device with highest Gflops/s
+ findCudaDevice(argc, (const char **)argv);
+
+ cudaDeviceProp prop;
+ int whichDevice;
+ cudaGetDevice( &whichDevice );
+ cudaGetDeviceProperties( &prop, whichDevice );
+ //debug code
+ if (prop.canMapHostMemory != 1) {
+ printf( "Device can not map memory.\n" );
+ }
+
+ printf("L2 Cache size= %u \n",prop.l2CacheSize);
+ printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+ printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+ printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+ printf("deviceOverlap= %i \n",prop.deviceOverlap);
+ //end debug code
+
+ //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+
+
+ // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
+
+ size_t size = signal_length_samples * sizeof(GPU_Complex);
+
+ checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
+ //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
+ checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
+
+ // old version: all local codes are independent vectors
+ //checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
+
+ // new version: only one vector with extra samples to shift the local code for the correlator set
+ // Required: The last correlator tap in d_shifts_samples has the largest sample shift
+ checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size+sizeof(GPU_Complex)*shifts_samples[n_correlators-1]));
+ checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators));
+
+ //scalars
+ checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
+
+ // Launch the Vector Add CUDA Kernel
+ threadsPerBlock = 256;
+ blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
+
+ return true;
+}
+
+
+bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
+ std::complex<float>* corr_out,
+ const std::complex<float>* sig_in,
+ const std::complex<float>* local_codes_in,
+ float rem_carrier_phase_in_rad,
+ float phase_step_rad,
+ const int *shifts_samples,
+ int signal_length_samples,
+ int n_correlators)
+ {
+
+ cudaStream_t stream1;
+ cudaStream_t stream2;
+ cudaStreamCreate ( &stream1) ;
+ cudaStreamCreate ( &stream2) ;
+
+ size_t memSize = signal_length_samples * sizeof(std::complex<float>);
+
+ // input signal CPU -> GPU copy memory
+ checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+ cudaMemcpyHostToDevice, stream1));
+
+ //***** NOTICE: NCO is computed on-the-fly, no need to copy the NCO into the GPU! ****
+ //checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
+ // cudaMemcpyHostToDevice, stream1));
+
+
+ // old version: all local codes are independent vectors
+ //checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
+ // cudaMemcpyHostToDevice, stream2));
+
+ // new version: only one vector with extra samples to shift the local code for the correlator set
+ // Required: The last correlator tap in d_shifts_samples has the largest sample shift
+
+ // local code CPU -> GPU copy memory
+ checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
+ cudaMemcpyHostToDevice, stream2));
+ // Correlator shifts vector CPU -> GPU copy memory
+ checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
+ cudaMemcpyHostToDevice, stream2));
+
+
+ //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
+ checkCudaErrors(cudaStreamSynchronize(stream1));
+ CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
+
+
+ //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
+
+ //wait for Doppler wipeoff end...
+ checkCudaErrors(cudaDeviceSynchronize());
+
+ //old
+// scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
+// d_corr_out,
+// d_sig_doppler_wiped,
+// d_local_codes_in,
+// 3,
+// signal_length_samples
+// );
+
+ //new
+ //launch the multitap correlator
+ scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
+ d_corr_out,
+ d_sig_doppler_wiped,
+ d_local_codes_in,
+ d_shifts_samples,
+ n_correlators,
+ signal_length_samples
+ );
+ checkCudaErrors(cudaGetLastError());
+ //wait for correlators end...
+ checkCudaErrors(cudaDeviceSynchronize());
+ // Copy the device result vector in device memory to the host result vector
+ // in host memory.
+
+ //scalar products (correlators outputs)
+ checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+ cudaMemcpyDeviceToHost, 0));
+
+ cudaStreamDestroy(stream1) ;
+ cudaStreamDestroy(stream2) ;
+ return true;
+}
+
+bool cuda_multicorrelator::free_cuda()
+{
+ // Free device global memory
+ cudaFree(d_sig_in);
+ //cudaFree(d_nco_in);
+ cudaFree(d_local_codes_in);
+ cudaFree(d_corr_out);
+
+ // Reset the device and exit
+ // cudaDeviceReset causes the driver to clean up all state. While
+ // not mandatory in normal operation, it is good practice. It is also
+ // needed to ensure correct operation when the application is being
+ // profiled. Calling cudaDeviceReset causes all profile data to be
+ // flushed before the application exits
+ checkCudaErrors(cudaDeviceReset());
+ return true;
+}
+
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
new file mode 100644
index 0000000..8819edf
--- /dev/null
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -0,0 +1,138 @@
+/*!
+ * \file cuda_multicorrelator.h
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
+ * \authors <ul>
+ * <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ * </ul>
+ *
+ * Class that implements a highly optimized vector multiTAP correlator for NVIDIA CUDA GPUs
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef CUDA_MULTICORRELATOR_H_
+#define CUDA_MULTICORRELATOR_H_
+
+
+#ifdef __CUDACC__
+#define CUDA_CALLABLE_MEMBER_GLOBAL __global__
+#define CUDA_CALLABLE_MEMBER_DEVICE __device__
+#else
+#define CUDA_CALLABLE_MEMBER_GLOBAL
+#define CUDA_CALLABLE_MEMBER_DEVICE
+#endif
+
+#include <complex>
+
+// GPU new internal data types for complex numbers
+
+struct GPU_Complex {
+ float r;
+ float i;
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
+ CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
+ return r * r + i * i;
+ }
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
+ #ifdef __CUDACC__
+ return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
+ #else
+ return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
+ #endif
+ }
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
+ return GPU_Complex(r+a.r, i+a.i);
+ }
+ CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
+ r+=a.r;
+ i+=a.i;
+ }
+ CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
+ {
+ //c=a*b+c
+ //real part
+ //c.r=(a.r*b.r - a.i*b.i)+c.r
+ #ifdef __CUDACC__
+ r=__fmaf_rn(a.r,b.r,r);
+ r=__fmaf_rn(-a.i,b.i,r);
+ //imag part
+ i=__fmaf_rn(a.i,b.r,i);
+ i=__fmaf_rn(a.r,b.i,i);
+ #else
+ r=(a.r*b.r - a.i*b.i)+r;
+ i=(a.i*b.r - a.r*b.i)+i;
+ #endif
+
+ }
+};
+
+struct GPU_Complex_Short {
+ float r;
+ float i;
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
+ CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
+ return r * r + i * i;
+ }
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
+ return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
+ }
+ CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
+ return GPU_Complex_Short(r+a.r, i+a.i);
+ }
+};
+/*!
+ * \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
+ */
+class cuda_multicorrelator
+{
+public:
+ bool init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators);
+
+ bool free_cuda();
+ bool Carrier_wipeoff_multicorrelator_cuda(
+ std::complex<float>* corr_out,
+ const std::complex<float>* sig_in,
+ const std::complex<float>* local_codes_in,
+ float rem_carrier_phase_in_rad,
+ float phase_step_rad,
+ const int *shifts_samples,
+ int signal_length_samples,
+ int n_correlators);
+private:
+ // Allocate the device input vectors
+ GPU_Complex *d_sig_in;
+ GPU_Complex *d_nco_in;
+ GPU_Complex *d_sig_doppler_wiped;
+ GPU_Complex *d_local_codes_in;
+ GPU_Complex *d_corr_out;
+ int *d_shifts_samples;
+ int threadsPerBlock;
+ int blocksPerGrid;
+
+};
+
+
+#endif /* CUDA_MULTICORRELATOR_H_ */
diff --git a/src/algorithms/tracking/libs/cudahelpers/exception.h b/src/algorithms/tracking/libs/cudahelpers/exception.h
new file mode 100644
index 0000000..adda4bc
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/exception.h
@@ -0,0 +1,151 @@
+/*
+* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+/* CUda UTility Library */
+#ifndef _EXCEPTION_H_
+#define _EXCEPTION_H_
+
+// includes, system
+#include <exception>
+#include <stdexcept>
+#include <iostream>
+#include <stdlib.h>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template<class Std_Exception>
+class Exception : public Std_Exception
+{
+ public:
+
+ //! @brief Static construction interface
+ //! @return Always throws ( Located_Exception<Exception>)
+ //! @param file file in which the Exception occurs
+ //! @param line line in which the Exception occurs
+ //! @param detailed details on the code fragment causing the Exception
+ static void throw_it(const char *file,
+ const int line,
+ const char *detailed = "-");
+
+ //! Static construction interface
+ //! @return Always throws ( Located_Exception<Exception>)
+ //! @param file file in which the Exception occurs
+ //! @param line line in which the Exception occurs
+ //! @param detailed details on the code fragment causing the Exception
+ static void throw_it(const char *file,
+ const int line,
+ const std::string &detailed);
+
+ //! Destructor
+ virtual ~Exception() throw();
+
+ private:
+
+ //! Constructor, default (private)
+ Exception();
+
+ //! Constructor, standard
+ //! @param str string returned by what()
+ Exception(const std::string &str);
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template<class Exception_Typ>
+inline void
+handleException(const Exception_Typ &ex)
+{
+ std::cerr << ex.what() << std::endl;
+
+ exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION( msg) \
+ Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION( msg) \
+ Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION( msg) \
+ Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it(const char *file, const int line, const char *detailed)
+{
+ std::stringstream s;
+
+ // Quite heavy-weight, but exceptions are not for
+ // performance / release versions
+ s << "Exception in file '" << file << "' in line " << line << "\n"
+ << "Detailed description: " << detailed << "\n";
+
+ throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it(const char *file, const int line, const std::string &msg)
+{
+ throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception() :
+ Std_Exception("Unknown Exception.\n")
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) :
+ Std_Exception(s)
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() { }
+
+// functions, exported
+
+#endif // #ifndef _EXCEPTION_H_
+
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h b/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h
new file mode 100644
index 0000000..1d3e920
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h
@@ -0,0 +1,1255 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef HELPER_CUDA_H
+#define HELPER_CUDA_H
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: it is required that your SDK sample include the proper header files; please
+// refer to the CUDA examples for examples of the needed CUDA headers, which may change depending
+// on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+ switch (error)
+ {
+ case cudaSuccess:
+ return "cudaSuccess";
+
+ case cudaErrorMissingConfiguration:
+ return "cudaErrorMissingConfiguration";
+
+ case cudaErrorMemoryAllocation:
+ return "cudaErrorMemoryAllocation";
+
+ case cudaErrorInitializationError:
+ return "cudaErrorInitializationError";
+
+ case cudaErrorLaunchFailure:
+ return "cudaErrorLaunchFailure";
+
+ case cudaErrorPriorLaunchFailure:
+ return "cudaErrorPriorLaunchFailure";
+
+ case cudaErrorLaunchTimeout:
+ return "cudaErrorLaunchTimeout";
+
+ case cudaErrorLaunchOutOfResources:
+ return "cudaErrorLaunchOutOfResources";
+
+ case cudaErrorInvalidDeviceFunction:
+ return "cudaErrorInvalidDeviceFunction";
+
+ case cudaErrorInvalidConfiguration:
+ return "cudaErrorInvalidConfiguration";
+
+ case cudaErrorInvalidDevice:
+ return "cudaErrorInvalidDevice";
+
+ case cudaErrorInvalidValue:
+ return "cudaErrorInvalidValue";
+
+ case cudaErrorInvalidPitchValue:
+ return "cudaErrorInvalidPitchValue";
+
+ case cudaErrorInvalidSymbol:
+ return "cudaErrorInvalidSymbol";
+
+ case cudaErrorMapBufferObjectFailed:
+ return "cudaErrorMapBufferObjectFailed";
+
+ case cudaErrorUnmapBufferObjectFailed:
+ return "cudaErrorUnmapBufferObjectFailed";
+
+ case cudaErrorInvalidHostPointer:
+ return "cudaErrorInvalidHostPointer";
+
+ case cudaErrorInvalidDevicePointer:
+ return "cudaErrorInvalidDevicePointer";
+
+ case cudaErrorInvalidTexture:
+ return "cudaErrorInvalidTexture";
+
+ case cudaErrorInvalidTextureBinding:
+ return "cudaErrorInvalidTextureBinding";
+
+ case cudaErrorInvalidChannelDescriptor:
+ return "cudaErrorInvalidChannelDescriptor";
+
+ case cudaErrorInvalidMemcpyDirection:
+ return "cudaErrorInvalidMemcpyDirection";
+
+ case cudaErrorAddressOfConstant:
+ return "cudaErrorAddressOfConstant";
+
+ case cudaErrorTextureFetchFailed:
+ return "cudaErrorTextureFetchFailed";
+
+ case cudaErrorTextureNotBound:
+ return "cudaErrorTextureNotBound";
+
+ case cudaErrorSynchronizationError:
+ return "cudaErrorSynchronizationError";
+
+ case cudaErrorInvalidFilterSetting:
+ return "cudaErrorInvalidFilterSetting";
+
+ case cudaErrorInvalidNormSetting:
+ return "cudaErrorInvalidNormSetting";
+
+ case cudaErrorMixedDeviceExecution:
+ return "cudaErrorMixedDeviceExecution";
+
+ case cudaErrorCudartUnloading:
+ return "cudaErrorCudartUnloading";
+
+ case cudaErrorUnknown:
+ return "cudaErrorUnknown";
+
+ case cudaErrorNotYetImplemented:
+ return "cudaErrorNotYetImplemented";
+
+ case cudaErrorMemoryValueTooLarge:
+ return "cudaErrorMemoryValueTooLarge";
+
+ case cudaErrorInvalidResourceHandle:
+ return "cudaErrorInvalidResourceHandle";
+
+ case cudaErrorNotReady:
+ return "cudaErrorNotReady";
+
+ case cudaErrorInsufficientDriver:
+ return "cudaErrorInsufficientDriver";
+
+ case cudaErrorSetOnActiveProcess:
+ return "cudaErrorSetOnActiveProcess";
+
+ case cudaErrorInvalidSurface:
+ return "cudaErrorInvalidSurface";
+
+ case cudaErrorNoDevice:
+ return "cudaErrorNoDevice";
+
+ case cudaErrorECCUncorrectable:
+ return "cudaErrorECCUncorrectable";
+
+ case cudaErrorSharedObjectSymbolNotFound:
+ return "cudaErrorSharedObjectSymbolNotFound";
+
+ case cudaErrorSharedObjectInitFailed:
+ return "cudaErrorSharedObjectInitFailed";
+
+ case cudaErrorUnsupportedLimit:
+ return "cudaErrorUnsupportedLimit";
+
+ case cudaErrorDuplicateVariableName:
+ return "cudaErrorDuplicateVariableName";
+
+ case cudaErrorDuplicateTextureName:
+ return "cudaErrorDuplicateTextureName";
+
+ case cudaErrorDuplicateSurfaceName:
+ return "cudaErrorDuplicateSurfaceName";
+
+ case cudaErrorDevicesUnavailable:
+ return "cudaErrorDevicesUnavailable";
+
+ case cudaErrorInvalidKernelImage:
+ return "cudaErrorInvalidKernelImage";
+
+ case cudaErrorNoKernelImageForDevice:
+ return "cudaErrorNoKernelImageForDevice";
+
+ case cudaErrorIncompatibleDriverContext:
+ return "cudaErrorIncompatibleDriverContext";
+
+ case cudaErrorPeerAccessAlreadyEnabled:
+ return "cudaErrorPeerAccessAlreadyEnabled";
+
+ case cudaErrorPeerAccessNotEnabled:
+ return "cudaErrorPeerAccessNotEnabled";
+
+ case cudaErrorDeviceAlreadyInUse:
+ return "cudaErrorDeviceAlreadyInUse";
+
+ case cudaErrorProfilerDisabled:
+ return "cudaErrorProfilerDisabled";
+
+ case cudaErrorProfilerNotInitialized:
+ return "cudaErrorProfilerNotInitialized";
+
+ case cudaErrorProfilerAlreadyStarted:
+ return "cudaErrorProfilerAlreadyStarted";
+
+ case cudaErrorProfilerAlreadyStopped:
+ return "cudaErrorProfilerAlreadyStopped";
+
+ /* Since CUDA 4.0*/
+ case cudaErrorAssert:
+ return "cudaErrorAssert";
+
+ case cudaErrorTooManyPeers:
+ return "cudaErrorTooManyPeers";
+
+ case cudaErrorHostMemoryAlreadyRegistered:
+ return "cudaErrorHostMemoryAlreadyRegistered";
+
+ case cudaErrorHostMemoryNotRegistered:
+ return "cudaErrorHostMemoryNotRegistered";
+
+ /* Since CUDA 5.0 */
+ case cudaErrorOperatingSystem:
+ return "cudaErrorOperatingSystem";
+
+ case cudaErrorPeerAccessUnsupported:
+ return "cudaErrorPeerAccessUnsupported";
+
+ case cudaErrorLaunchMaxDepthExceeded:
+ return "cudaErrorLaunchMaxDepthExceeded";
+
+ case cudaErrorLaunchFileScopedTex:
+ return "cudaErrorLaunchFileScopedTex";
+
+ case cudaErrorLaunchFileScopedSurf:
+ return "cudaErrorLaunchFileScopedSurf";
+
+ case cudaErrorSyncDepthExceeded:
+ return "cudaErrorSyncDepthExceeded";
+
+ case cudaErrorLaunchPendingCountExceeded:
+ return "cudaErrorLaunchPendingCountExceeded";
+
+ case cudaErrorNotPermitted:
+ return "cudaErrorNotPermitted";
+
+ case cudaErrorNotSupported:
+ return "cudaErrorNotSupported";
+
+ /* Since CUDA 6.0 */
+ case cudaErrorHardwareStackError:
+ return "cudaErrorHardwareStackError";
+
+ case cudaErrorIllegalInstruction:
+ return "cudaErrorIllegalInstruction";
+
+ case cudaErrorMisalignedAddress:
+ return "cudaErrorMisalignedAddress";
+
+ case cudaErrorInvalidAddressSpace:
+ return "cudaErrorInvalidAddressSpace";
+
+ case cudaErrorInvalidPc:
+ return "cudaErrorInvalidPc";
+
+ case cudaErrorIllegalAddress:
+ return "cudaErrorIllegalAddress";
+
+ /* Since CUDA 6.5*/
+ case cudaErrorInvalidPtx:
+ return "cudaErrorInvalidPtx";
+
+ case cudaErrorInvalidGraphicsContext:
+ return "cudaErrorInvalidGraphicsContext";
+
+ case cudaErrorStartupFailure:
+ return "cudaErrorStartupFailure";
+
+ case cudaErrorApiFailureBase:
+ return "cudaErrorApiFailureBase";
+ }
+
+ return "<unknown>";
+}
+#endif
+
+#ifdef __cuda_cuda_h__
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error)
+{
+ switch (error)
+ {
+ case CUDA_SUCCESS:
+ return "CUDA_SUCCESS";
+
+ case CUDA_ERROR_INVALID_VALUE:
+ return "CUDA_ERROR_INVALID_VALUE";
+
+ case CUDA_ERROR_OUT_OF_MEMORY:
+ return "CUDA_ERROR_OUT_OF_MEMORY";
+
+ case CUDA_ERROR_NOT_INITIALIZED:
+ return "CUDA_ERROR_NOT_INITIALIZED";
+
+ case CUDA_ERROR_DEINITIALIZED:
+ return "CUDA_ERROR_DEINITIALIZED";
+
+ case CUDA_ERROR_PROFILER_DISABLED:
+ return "CUDA_ERROR_PROFILER_DISABLED";
+
+ case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+ return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+
+ case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+ return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+
+ case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+ return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+
+ case CUDA_ERROR_NO_DEVICE:
+ return "CUDA_ERROR_NO_DEVICE";
+
+ case CUDA_ERROR_INVALID_DEVICE:
+ return "CUDA_ERROR_INVALID_DEVICE";
+
+ case CUDA_ERROR_INVALID_IMAGE:
+ return "CUDA_ERROR_INVALID_IMAGE";
+
+ case CUDA_ERROR_INVALID_CONTEXT:
+ return "CUDA_ERROR_INVALID_CONTEXT";
+
+ case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+ return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+
+ case CUDA_ERROR_MAP_FAILED:
+ return "CUDA_ERROR_MAP_FAILED";
+
+ case CUDA_ERROR_UNMAP_FAILED:
+ return "CUDA_ERROR_UNMAP_FAILED";
+
+ case CUDA_ERROR_ARRAY_IS_MAPPED:
+ return "CUDA_ERROR_ARRAY_IS_MAPPED";
+
+ case CUDA_ERROR_ALREADY_MAPPED:
+ return "CUDA_ERROR_ALREADY_MAPPED";
+
+ case CUDA_ERROR_NO_BINARY_FOR_GPU:
+ return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+
+ case CUDA_ERROR_ALREADY_ACQUIRED:
+ return "CUDA_ERROR_ALREADY_ACQUIRED";
+
+ case CUDA_ERROR_NOT_MAPPED:
+ return "CUDA_ERROR_NOT_MAPPED";
+
+ case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+ return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+
+ case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+ return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+
+ case CUDA_ERROR_ECC_UNCORRECTABLE:
+ return "CUDA_ERROR_ECC_UNCORRECTABLE";
+
+ case CUDA_ERROR_UNSUPPORTED_LIMIT:
+ return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+
+ case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+ return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+
+ case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
+ return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
+
+ case CUDA_ERROR_INVALID_PTX:
+ return "CUDA_ERROR_INVALID_PTX";
+
+ case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
+ return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
+
+ case CUDA_ERROR_INVALID_SOURCE:
+ return "CUDA_ERROR_INVALID_SOURCE";
+
+ case CUDA_ERROR_FILE_NOT_FOUND:
+ return "CUDA_ERROR_FILE_NOT_FOUND";
+
+ case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+ return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+
+ case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+ return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+
+ case CUDA_ERROR_OPERATING_SYSTEM:
+ return "CUDA_ERROR_OPERATING_SYSTEM";
+
+ case CUDA_ERROR_INVALID_HANDLE:
+ return "CUDA_ERROR_INVALID_HANDLE";
+
+ case CUDA_ERROR_NOT_FOUND:
+ return "CUDA_ERROR_NOT_FOUND";
+
+ case CUDA_ERROR_NOT_READY:
+ return "CUDA_ERROR_NOT_READY";
+
+ case CUDA_ERROR_ILLEGAL_ADDRESS:
+ return "CUDA_ERROR_ILLEGAL_ADDRESS";
+
+ case CUDA_ERROR_LAUNCH_FAILED:
+ return "CUDA_ERROR_LAUNCH_FAILED";
+
+ case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+ return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+ case CUDA_ERROR_LAUNCH_TIMEOUT:
+ return "CUDA_ERROR_LAUNCH_TIMEOUT";
+
+ case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+ return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+
+ case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+ return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+
+ case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+ return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+
+ case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+ return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+
+ case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+ return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+
+ case CUDA_ERROR_ASSERT:
+ return "CUDA_ERROR_ASSERT";
+
+ case CUDA_ERROR_TOO_MANY_PEERS:
+ return "CUDA_ERROR_TOO_MANY_PEERS";
+
+ case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+ return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+
+ case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+ return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+
+ case CUDA_ERROR_HARDWARE_STACK_ERROR:
+ return "CUDA_ERROR_HARDWARE_STACK_ERROR";
+
+ case CUDA_ERROR_ILLEGAL_INSTRUCTION:
+ return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
+
+ case CUDA_ERROR_MISALIGNED_ADDRESS:
+ return "CUDA_ERROR_MISALIGNED_ADDRESS";
+
+ case CUDA_ERROR_INVALID_ADDRESS_SPACE:
+ return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
+
+ case CUDA_ERROR_INVALID_PC:
+ return "CUDA_ERROR_INVALID_PC";
+
+ case CUDA_ERROR_NOT_PERMITTED:
+ return "CUDA_ERROR_NOT_PERMITTED";
+
+ case CUDA_ERROR_NOT_SUPPORTED:
+ return "CUDA_ERROR_NOT_SUPPORTED";
+
+ case CUDA_ERROR_UNKNOWN:
+ return "CUDA_ERROR_UNKNOWN";
+ }
+
+ return "<unknown>";
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error)
+{
+ switch (error)
+ {
+ case CUBLAS_STATUS_SUCCESS:
+ return "CUBLAS_STATUS_SUCCESS";
+
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "CUBLAS_STATUS_ALLOC_FAILED";
+
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "CUBLAS_STATUS_INVALID_VALUE";
+
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "CUBLAS_STATUS_MAPPING_ERROR";
+
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "CUBLAS_STATUS_INTERNAL_ERROR";
+ }
+
+ return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error)
+{
+ switch (error)
+ {
+ case CUFFT_SUCCESS:
+ return "CUFFT_SUCCESS";
+
+ case CUFFT_INVALID_PLAN:
+ return "CUFFT_INVALID_PLAN";
+
+ case CUFFT_ALLOC_FAILED:
+ return "CUFFT_ALLOC_FAILED";
+
+ case CUFFT_INVALID_TYPE:
+ return "CUFFT_INVALID_TYPE";
+
+ case CUFFT_INVALID_VALUE:
+ return "CUFFT_INVALID_VALUE";
+
+ case CUFFT_INTERNAL_ERROR:
+ return "CUFFT_INTERNAL_ERROR";
+
+ case CUFFT_EXEC_FAILED:
+ return "CUFFT_EXEC_FAILED";
+
+ case CUFFT_SETUP_FAILED:
+ return "CUFFT_SETUP_FAILED";
+
+ case CUFFT_INVALID_SIZE:
+ return "CUFFT_INVALID_SIZE";
+
+ case CUFFT_UNALIGNED_DATA:
+ return "CUFFT_UNALIGNED_DATA";
+
+ case CUFFT_INCOMPLETE_PARAMETER_LIST:
+ return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+ case CUFFT_INVALID_DEVICE:
+ return "CUFFT_INVALID_DEVICE";
+
+ case CUFFT_PARSE_ERROR:
+ return "CUFFT_PARSE_ERROR";
+
+ case CUFFT_NO_WORKSPACE:
+ return "CUFFT_NO_WORKSPACE";
+
+ case CUFFT_NOT_IMPLEMENTED:
+ return "CUFFT_NOT_IMPLEMENTED";
+
+ case CUFFT_LICENSE_ERROR:
+ return "CUFFT_LICENSE_ERROR";
+ }
+
+ return "<unknown>";
+}
+#endif
+
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error)
+{
+ switch (error)
+ {
+ case CUSPARSE_STATUS_SUCCESS:
+ return "CUSPARSE_STATUS_SUCCESS";
+
+ case CUSPARSE_STATUS_NOT_INITIALIZED:
+ return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+ case CUSPARSE_STATUS_ALLOC_FAILED:
+ return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+ case CUSPARSE_STATUS_INVALID_VALUE:
+ return "CUSPARSE_STATUS_INVALID_VALUE";
+
+ case CUSPARSE_STATUS_ARCH_MISMATCH:
+ return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+ case CUSPARSE_STATUS_MAPPING_ERROR:
+ return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+ case CUSPARSE_STATUS_EXECUTION_FAILED:
+ return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+ case CUSPARSE_STATUS_INTERNAL_ERROR:
+ return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+ case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+ return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+ }
+
+ return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+//cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error)
+{
+ switch(error)
+ {
+ case CUSOLVER_STATUS_SUCCESS:
+ return "CUSOLVER_STATUS_SUCCESS";
+ case CUSOLVER_STATUS_NOT_INITIALIZED:
+ return "CUSOLVER_STATUS_NOT_INITIALIZED";
+ case CUSOLVER_STATUS_ALLOC_FAILED:
+ return "CUSOLVER_STATUS_ALLOC_FAILED";
+ case CUSOLVER_STATUS_INVALID_VALUE:
+ return "CUSOLVER_STATUS_INVALID_VALUE";
+ case CUSOLVER_STATUS_ARCH_MISMATCH:
+ return "CUSOLVER_STATUS_ARCH_MISMATCH";
+ case CUSOLVER_STATUS_MAPPING_ERROR:
+ return "CUSOLVER_STATUS_MAPPING_ERROR";
+ case CUSOLVER_STATUS_EXECUTION_FAILED:
+ return "CUSOLVER_STATUS_EXECUTION_FAILED";
+ case CUSOLVER_STATUS_INTERNAL_ERROR:
+ return "CUSOLVER_STATUS_INTERNAL_ERROR";
+ case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+ return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+ case CUSOLVER_STATUS_NOT_SUPPORTED :
+ return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+ case CUSOLVER_STATUS_ZERO_PIVOT:
+ return "CUSOLVER_STATUS_ZERO_PIVOT";
+ case CUSOLVER_STATUS_INVALID_LICENSE:
+ return "CUSOLVER_STATUS_INVALID_LICENSE";
+ }
+
+ return "<unknown>";
+
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error)
+{
+ switch (error)
+ {
+ case CURAND_STATUS_SUCCESS:
+ return "CURAND_STATUS_SUCCESS";
+
+ case CURAND_STATUS_VERSION_MISMATCH:
+ return "CURAND_STATUS_VERSION_MISMATCH";
+
+ case CURAND_STATUS_NOT_INITIALIZED:
+ return "CURAND_STATUS_NOT_INITIALIZED";
+
+ case CURAND_STATUS_ALLOCATION_FAILED:
+ return "CURAND_STATUS_ALLOCATION_FAILED";
+
+ case CURAND_STATUS_TYPE_ERROR:
+ return "CURAND_STATUS_TYPE_ERROR";
+
+ case CURAND_STATUS_OUT_OF_RANGE:
+ return "CURAND_STATUS_OUT_OF_RANGE";
+
+ case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+ return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+ case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+ return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+ case CURAND_STATUS_LAUNCH_FAILURE:
+ return "CURAND_STATUS_LAUNCH_FAILURE";
+
+ case CURAND_STATUS_PREEXISTING_FAILURE:
+ return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+ case CURAND_STATUS_INITIALIZATION_FAILED:
+ return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+ case CURAND_STATUS_ARCH_MISMATCH:
+ return "CURAND_STATUS_ARCH_MISMATCH";
+
+ case CURAND_STATUS_INTERNAL_ERROR:
+ return "CURAND_STATUS_INTERNAL_ERROR";
+ }
+
+ return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error)
+{
+ switch (error)
+ {
+ case NPP_NOT_SUPPORTED_MODE_ERROR:
+ return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+ case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+ return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+ case NPP_RESIZE_NO_OPERATION_ERROR:
+ return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+ case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+ return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+ case NPP_BAD_ARG_ERROR:
+ return "NPP_BAD_ARGUMENT_ERROR";
+
+ case NPP_COEFF_ERROR:
+ return "NPP_COEFFICIENT_ERROR";
+
+ case NPP_RECT_ERROR:
+ return "NPP_RECTANGLE_ERROR";
+
+ case NPP_QUAD_ERROR:
+ return "NPP_QUADRANGLE_ERROR";
+
+ case NPP_MEM_ALLOC_ERR:
+ return "NPP_MEMORY_ALLOCATION_ERROR";
+
+ case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+ return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+ case NPP_INVALID_INPUT:
+ return "NPP_INVALID_INPUT";
+
+ case NPP_POINTER_ERROR:
+ return "NPP_POINTER_ERROR";
+
+ case NPP_WARNING:
+ return "NPP_WARNING";
+
+ case NPP_ODD_ROI_WARNING:
+ return "NPP_ODD_ROI_WARNING";
+#else
+
+ // These are for CUDA 5.5 or higher
+ case NPP_BAD_ARGUMENT_ERROR:
+ return "NPP_BAD_ARGUMENT_ERROR";
+
+ case NPP_COEFFICIENT_ERROR:
+ return "NPP_COEFFICIENT_ERROR";
+
+ case NPP_RECTANGLE_ERROR:
+ return "NPP_RECTANGLE_ERROR";
+
+ case NPP_QUADRANGLE_ERROR:
+ return "NPP_QUADRANGLE_ERROR";
+
+ case NPP_MEMORY_ALLOCATION_ERR:
+ return "NPP_MEMORY_ALLOCATION_ERROR";
+
+ case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+ return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+ case NPP_INVALID_HOST_POINTER_ERROR:
+ return "NPP_INVALID_HOST_POINTER_ERROR";
+
+ case NPP_INVALID_DEVICE_POINTER_ERROR:
+ return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+ case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+ return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+ case NPP_TEXTURE_BIND_ERROR:
+ return "NPP_TEXTURE_BIND_ERROR";
+
+ case NPP_WRONG_INTERSECTION_ROI_ERROR:
+ return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+ case NPP_NOT_EVEN_STEP_ERROR:
+ return "NPP_NOT_EVEN_STEP_ERROR";
+
+ case NPP_INTERPOLATION_ERROR:
+ return "NPP_INTERPOLATION_ERROR";
+
+ case NPP_RESIZE_FACTOR_ERROR:
+ return "NPP_RESIZE_FACTOR_ERROR";
+
+ case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+ return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+ case NPP_MEMFREE_ERR:
+ return "NPP_MEMFREE_ERR";
+
+ case NPP_MEMSET_ERR:
+ return "NPP_MEMSET_ERR";
+
+ case NPP_MEMCPY_ERR:
+ return "NPP_MEMCPY_ERROR";
+
+ case NPP_MIRROR_FLIP_ERR:
+ return "NPP_MIRROR_FLIP_ERR";
+#else
+
+ case NPP_MEMFREE_ERROR:
+ return "NPP_MEMFREE_ERROR";
+
+ case NPP_MEMSET_ERROR:
+ return "NPP_MEMSET_ERROR";
+
+ case NPP_MEMCPY_ERROR:
+ return "NPP_MEMCPY_ERROR";
+
+ case NPP_MIRROR_FLIP_ERROR:
+ return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+ case NPP_ALIGNMENT_ERROR:
+ return "NPP_ALIGNMENT_ERROR";
+
+ case NPP_STEP_ERROR:
+ return "NPP_STEP_ERROR";
+
+ case NPP_SIZE_ERROR:
+ return "NPP_SIZE_ERROR";
+
+ case NPP_NULL_POINTER_ERROR:
+ return "NPP_NULL_POINTER_ERROR";
+
+ case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+ return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+ case NPP_NOT_IMPLEMENTED_ERROR:
+ return "NPP_NOT_IMPLEMENTED_ERROR";
+
+ case NPP_ERROR:
+ return "NPP_ERROR";
+
+ case NPP_SUCCESS:
+ return "NPP_SUCCESS";
+
+ case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+ return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+ case NPP_MISALIGNED_DST_ROI_WARNING:
+ return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+ case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+ return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+ case NPP_DOUBLE_SIZE_WARNING:
+ return "NPP_DOUBLE_SIZE_WARNING";
+
+ case NPP_WRONG_INTERSECTION_ROI_WARNING:
+ return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+ /* These are 6.0 or higher */
+ case NPP_LUT_PALETTE_BITSIZE_ERROR:
+ return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+ case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+ return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+ case NPP_QUALITY_INDEX_ERROR:
+ return "NPP_QUALITY_INDEX_ERROR";
+
+ case NPP_CHANNEL_ORDER_ERROR:
+ return "NPP_CHANNEL_ORDER_ERROR";
+
+ case NPP_ZERO_MASK_VALUE_ERROR:
+ return "NPP_ZERO_MASK_VALUE_ERROR";
+
+ case NPP_NUMBER_OF_CHANNELS_ERROR:
+ return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+ case NPP_COI_ERROR:
+ return "NPP_COI_ERROR";
+
+ case NPP_DIVISOR_ERROR:
+ return "NPP_DIVISOR_ERROR";
+
+ case NPP_CHANNEL_ERROR:
+ return "NPP_CHANNEL_ERROR";
+
+ case NPP_STRIDE_ERROR:
+ return "NPP_STRIDE_ERROR";
+
+ case NPP_ANCHOR_ERROR:
+ return "NPP_ANCHOR_ERROR";
+
+ case NPP_MASK_SIZE_ERROR:
+ return "NPP_MASK_SIZE_ERROR";
+
+ case NPP_MOMENT_00_ZERO_ERROR:
+ return "NPP_MOMENT_00_ZERO_ERROR";
+
+ case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+ return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+ case NPP_THRESHOLD_ERROR:
+ return "NPP_THRESHOLD_ERROR";
+
+ case NPP_CONTEXT_MATCH_ERROR:
+ return "NPP_CONTEXT_MATCH_ERROR";
+
+ case NPP_FFT_FLAG_ERROR:
+ return "NPP_FFT_FLAG_ERROR";
+
+ case NPP_FFT_ORDER_ERROR:
+ return "NPP_FFT_ORDER_ERROR";
+
+ case NPP_SCALE_RANGE_ERROR:
+ return "NPP_SCALE_RANGE_ERROR";
+
+ case NPP_DATA_TYPE_ERROR:
+ return "NPP_DATA_TYPE_ERROR";
+
+ case NPP_OUT_OFF_RANGE_ERROR:
+ return "NPP_OUT_OFF_RANGE_ERROR";
+
+ case NPP_DIVIDE_BY_ZERO_ERROR:
+ return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+ case NPP_RANGE_ERROR:
+ return "NPP_RANGE_ERROR";
+
+ case NPP_NO_MEMORY_ERROR:
+ return "NPP_NO_MEMORY_ERROR";
+
+ case NPP_ERROR_RESERVED:
+ return "NPP_ERROR_RESERVED";
+
+ case NPP_NO_OPERATION_WARNING:
+ return "NPP_NO_OPERATION_WARNING";
+
+ case NPP_DIVIDE_BY_ZERO_WARNING:
+ return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+ }
+
+ return "<unknown>";
+}
+#endif
+
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset();
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+
+template< typename T >
+void check(T result, char const *const func, const char *const file, int const line)
+{
+ if (result)
+ {
+ fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+ file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+ DEVICE_RESET
+ // Make sure we call CUDA Device Reset before exiting
+ exit(EXIT_FAILURE);
+ }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
+{
+ cudaError_t err = cudaGetLastError();
+
+ if (cudaSuccess != err)
+ {
+ fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+ file, line, errorMessage, (int)err, cudaGetErrorString(err));
+ DEVICE_RESET
+ exit(EXIT_FAILURE);
+ }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a,b) (a > b ? a : b)
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value)
+{
+ return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
+ typedef struct
+ {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+ int Cores;
+ } sSMtoCores;
+
+ sSMtoCores nGpuArchCoresPerSM[] =
+ {
+ { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+ { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+ { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+ { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+ { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+ { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+ { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+ { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+ { -1, -1 }
+ };
+
+ int index = 0;
+
+ while (nGpuArchCoresPerSM[index].SM != -1)
+ {
+ if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+ {
+ return nGpuArchCoresPerSM[index].Cores;
+ }
+
+ index++;
+ }
+
+    // If we don't find the values, we default to using the previous one to run properly
+ printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+ return nGpuArchCoresPerSM[index-1].Cores;
+}
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID)
+{
+ int device_count;
+ checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+ if (device_count == 0)
+ {
+ fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (devID < 0)
+ {
+ devID = 0;
+ }
+
+ if (devID > device_count-1)
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
+ fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
+ fprintf(stderr, "\n");
+ return -devID;
+ }
+
+ cudaDeviceProp deviceProp;
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+
+ if (deviceProp.computeMode == cudaComputeModeProhibited)
+ {
+ fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+ return -1;
+ }
+
+ if (deviceProp.major < 1)
+ {
+ fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ checkCudaErrors(cudaSetDevice(devID));
+ printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
+
+ return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId()
+{
+ int current_device = 0, sm_per_multiproc = 0;
+ int max_perf_device = 0;
+ int device_count = 0, best_SM_arch = 0;
+ int devices_prohibited = 0;
+
+ unsigned long long max_compute_perf = 0;
+ cudaDeviceProp deviceProp;
+ cudaGetDeviceCount(&device_count);
+
+ checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+ if (device_count == 0)
+ {
+ fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // Find the best major SM Architecture GPU device
+ while (current_device < device_count)
+ {
+ cudaGetDeviceProperties(&deviceProp, current_device);
+
+ // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
+ if (deviceProp.computeMode != cudaComputeModeProhibited)
+ {
+ if (deviceProp.major > 0 && deviceProp.major < 9999)
+ {
+ best_SM_arch = MAX(best_SM_arch, deviceProp.major);
+ }
+ }
+ else
+ {
+ devices_prohibited++;
+ }
+
+ current_device++;
+ }
+
+ if (devices_prohibited == device_count)
+ {
+ fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // Find the best CUDA capable GPU device
+ current_device = 0;
+
+ while (current_device < device_count)
+ {
+ cudaGetDeviceProperties(&deviceProp, current_device);
+
+ // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
+ if (deviceProp.computeMode != cudaComputeModeProhibited)
+ {
+ if (deviceProp.major == 9999 && deviceProp.minor == 9999)
+ {
+ sm_per_multiproc = 1;
+ }
+ else
+ {
+ sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
+ }
+
+ unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+
+ if (compute_perf > max_compute_perf)
+ {
+ // If we find GPU with SM major > 2, search only these
+ if (best_SM_arch > 2)
+ {
+                // If our device==best_SM_arch, choose this, or else pass
+ if (deviceProp.major == best_SM_arch)
+ {
+ max_compute_perf = compute_perf;
+ max_perf_device = current_device;
+ }
+ }
+ else
+ {
+ max_compute_perf = compute_perf;
+ max_perf_device = current_device;
+ }
+ }
+ }
+
+ ++current_device;
+ }
+
+ return max_perf_device;
+}
+
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv)
+{
+ cudaDeviceProp deviceProp;
+ int devID = 0;
+
+ // If the command-line has a device number specified, use it
+ if (checkCmdLineFlag(argc, argv, "device"))
+ {
+ devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+ if (devID < 0)
+ {
+ printf("Invalid command line parameter\n ");
+ exit(EXIT_FAILURE);
+ }
+ else
+ {
+ devID = gpuDeviceInit(devID);
+
+ if (devID < 0)
+ {
+ printf("exiting...\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+ else
+ {
+ // Otherwise pick the device with highest Gflops/s
+ devID = gpuGetMaxGflopsDeviceId();
+ checkCudaErrors(cudaSetDevice(devID));
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+ printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
+ }
+
+ return devID;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version)
+{
+ cudaDeviceProp deviceProp;
+ deviceProp.major = 0;
+ deviceProp.minor = 0;
+ int dev;
+
+ checkCudaErrors(cudaGetDevice(&dev));
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+ if ((deviceProp.major > major_version) ||
+ (deviceProp.major == major_version && deviceProp.minor >= minor_version))
+ {
+ printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
+ return true;
+ }
+ else
+ {
+ printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
+ return false;
+ }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+
+#endif
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h
new file mode 100644
index 0000000..8112ec9
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h
@@ -0,0 +1,517 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects)
+#ifndef HELPER_CUDA_DRVAPI_H
+#define HELPER_CUDA_DRVAPI_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <helper_string.h>
+#include <drvapi_error_string.h>
+
+#ifndef MAX
+#define MAX(a,b) (a > b ? a : b)
+#endif
+
+#ifndef HELPER_CUDA_H
+inline int ftoi(float value)
+{
+ return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5));
+}
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions
+
+// add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H
+#ifdef __cuda_cuda_h__
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#ifndef checkCudaErrors
+#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
+
+// These are the inline versions for all of the SDK helper functions
+inline void __checkCudaErrors(CUresult err, const char *file, const int line)
+{
+ if (CUDA_SUCCESS != err)
+ {
+ fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
+ err, getCudaDrvErrorString(err), file, line);
+ exit(EXIT_FAILURE);
+ }
+}
+#endif
+
+#ifdef getLastCudaDrvErrorMsg
+#undef getLastCudaDrvErrorMsg
+#endif
+
+#define getLastCudaDrvErrorMsg(msg) __getLastCudaDrvErrorMsg (msg, __FILE__, __LINE__)
+
+inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line)
+{
+ CUresult err = cuCtxSynchronize();
+
+ if (CUDA_SUCCESS != err)
+ {
+ fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg);
+ fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n",
+ err, getCudaDrvErrorString(err), file, line);
+ exit(EXIT_FAILURE);
+ }
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
+{
+ CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device);
+
+ if (error_result != CUDA_SUCCESS)
+ {
+ printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result));
+ exit(EXIT_SUCCESS);
+ }
+}
+#endif
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2CoresDRV(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
+ typedef struct
+ {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+ int Cores;
+ } sSMtoCores;
+
+ sSMtoCores nGpuArchCoresPerSM[] =
+ {
+ { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+ { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+ { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+ { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+ { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+ { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+ { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+ { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+ { -1, -1 }
+ };
+
+ int index = 0;
+
+ while (nGpuArchCoresPerSM[index].SM != -1)
+ {
+ if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+ {
+ return nGpuArchCoresPerSM[index].Cores;
+ }
+
+ index++;
+ }
+
+    // If we don't find the values, we default to the closest previous entry so we can run properly
+ printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+ return nGpuArchCoresPerSM[index-1].Cores;
+}
+// end of GPU Architecture definitions
+
+#ifdef __cuda_cuda_h__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
+{
+ int cuDevice = 0;
+ int deviceCount = 0;
+ CUresult err = cuInit(0);
+
+ if (CUDA_SUCCESS == err)
+ {
+ checkCudaErrors(cuDeviceGetCount(&deviceCount));
+ }
+
+ if (deviceCount == 0)
+ {
+ fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
+ exit(EXIT_FAILURE);
+ }
+
+ int dev = 0;
+ dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device=");
+
+ if (dev < 0)
+ {
+ dev = 0;
+ }
+
+ if (dev > deviceCount-1)
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
+ fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
+ fprintf(stderr, "\n");
+ return -dev;
+ }
+
+ checkCudaErrors(cuDeviceGet(&cuDevice, dev));
+ char name[100];
+ cuDeviceGetName(name, 100, cuDevice);
+
+ int computeMode;
+ getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+
+ if (computeMode == CU_COMPUTEMODE_PROHIBITED)
+ {
+ fprintf(stderr, "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no threads can use this CUDA Device.\n");
+ return -1;
+ }
+
+ if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false)
+ {
+ printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
+ }
+
+ return dev;
+}
+
+// This function returns the best GPU based on performance
+inline int gpuGetMaxGflopsDeviceIdDRV()
+{
+ CUdevice current_device = 0;
+ CUdevice max_perf_device = 0;
+ int device_count = 0;
+ int sm_per_multiproc = 0;
+ unsigned long long max_compute_perf = 0;
+ int best_SM_arch = 0;
+ int major = 0;
+ int minor = 0;
+ int multiProcessorCount;
+ int clockRate;
+ int devices_prohibited = 0;
+
+ cuInit(0);
+ checkCudaErrors(cuDeviceGetCount(&device_count));
+
+ if (device_count == 0)
+ {
+ fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // Find the best major SM Architecture GPU device
+ while (current_device < device_count)
+ {
+ checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+ if (major > 0 && major < 9999)
+ {
+ best_SM_arch = MAX(best_SM_arch, major);
+ }
+
+ current_device++;
+ }
+
+ // Find the best CUDA capable GPU device
+ current_device = 0;
+
+ while (current_device < device_count)
+ {
+ checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
+ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+ current_device));
+ checkCudaErrors(cuDeviceGetAttribute(&clockRate,
+ CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
+ current_device));
+ checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+ int computeMode;
+ getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+
+ if (computeMode != CU_COMPUTEMODE_PROHIBITED)
+ {
+ if (major == 9999 && minor == 9999)
+ {
+ sm_per_multiproc = 1;
+ }
+ else
+ {
+ sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
+ }
+
+ unsigned long long compute_perf = (unsigned long long) (multiProcessorCount * sm_per_multiproc * clockRate);
+
+ if (compute_perf > max_compute_perf)
+ {
+ // If we find GPU with SM major > 2, search only these
+ if (best_SM_arch > 2)
+ {
+                // If our device's SM major == best_SM_arch, choose this one; otherwise pass
+ if (major == best_SM_arch)
+ {
+ max_compute_perf = compute_perf;
+ max_perf_device = current_device;
+ }
+ }
+ else
+ {
+ max_compute_perf = compute_perf;
+ max_perf_device = current_device;
+ }
+ }
+ }
+ else
+ {
+ devices_prohibited++;
+ }
+
+ ++current_device;
+ }
+
+ if (devices_prohibited == device_count)
+ {
+ fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode prohibited.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ return max_perf_device;
+}
+
+// This function returns the best Graphics GPU based on performance
+inline int gpuGetMaxGflopsGLDeviceIdDRV()
+{
+ CUdevice current_device = 0, max_perf_device = 0;
+ int device_count = 0, sm_per_multiproc = 0;
+ int max_compute_perf = 0, best_SM_arch = 0;
+ int major = 0, minor = 0, multiProcessorCount, clockRate;
+ int bTCC = 0;
+ int devices_prohibited = 0;
+ char deviceName[256];
+
+ cuInit(0);
+ checkCudaErrors(cuDeviceGetCount(&device_count));
+
+ if (device_count == 0)
+ {
+ fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n");
+ exit(EXIT_FAILURE);
+ }
+
+    // Find the best major SM Architecture among the GPU devices that are graphics devices
+ while (current_device < device_count)
+ {
+ checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
+ checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+#if CUDA_VERSION >= 3020
+ checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
+#else
+
+ // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
+ if (deviceName[0] == 'T')
+ {
+ bTCC = 1;
+ }
+
+#endif
+
+ int computeMode;
+ getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+
+ if (computeMode != CU_COMPUTEMODE_PROHIBITED)
+ {
+ if (!bTCC)
+ {
+ if (major > 0 && major < 9999)
+ {
+ best_SM_arch = MAX(best_SM_arch, major);
+ }
+ }
+ }
+ else
+ {
+ devices_prohibited++;
+ }
+
+ current_device++;
+ }
+
+ if (devices_prohibited == device_count)
+ {
+ fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: all devices have compute mode prohibited.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // Find the best CUDA capable GPU device
+ current_device = 0;
+
+ while (current_device < device_count)
+ {
+ checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
+ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+ current_device));
+ checkCudaErrors(cuDeviceGetAttribute(&clockRate,
+ CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
+ current_device));
+ checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+#if CUDA_VERSION >= 3020
+ checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
+#else
+
+ // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
+ if (deviceName[0] == 'T')
+ {
+ bTCC = 1;
+ }
+
+#endif
+
+ int computeMode;
+ getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+
+ if (computeMode != CU_COMPUTEMODE_PROHIBITED)
+ {
+ if (major == 9999 && minor == 9999)
+ {
+ sm_per_multiproc = 1;
+ }
+ else
+ {
+ sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
+ }
+
+            // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
+ if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
+ {
+ int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
+
+ if (compute_perf > max_compute_perf)
+ {
+ // If we find GPU with SM major > 2, search only these
+ if (best_SM_arch > 2)
+ {
+                    // If our device's SM major == best_SM_arch, then we pick this one
+ if (major == best_SM_arch)
+ {
+ max_compute_perf = compute_perf;
+ max_perf_device = current_device;
+ }
+ }
+ else
+ {
+ max_compute_perf = compute_perf;
+ max_perf_device = current_device;
+ }
+ }
+ }
+ }
+
+ ++current_device;
+ }
+
+ return max_perf_device;
+}
+
+// General initialization call to pick the best CUDA Device
+inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
+{
+ CUdevice cuDevice;
+ int devID = 0;
+
+ // If the command-line has a device number specified, use it
+ if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+ {
+ devID = gpuDeviceInitDRV(argc, argv);
+
+ if (devID < 0)
+ {
+ printf("exiting...\n");
+ exit(EXIT_SUCCESS);
+ }
+ }
+ else
+ {
+ // Otherwise pick the device with highest Gflops/s
+ char name[100];
+ devID = gpuGetMaxGflopsDeviceIdDRV();
+ checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+ cuDeviceGetName(name, 100, cuDevice);
+ printf("> Using CUDA Device [%d]: %s\n", devID, name);
+ }
+
+ cuDeviceGet(&cuDevice, devID);
+
+ return cuDevice;
+}
+
+// This function will pick the best CUDA device available with OpenGL interop
+inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv)
+{
+ CUdevice cuDevice;
+ int devID = 0;
+
+ // If the command-line has a device number specified, use it
+ if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+ {
+ devID = gpuDeviceInitDRV(argc, (const char **)argv);
+
+ if (devID < 0)
+ {
+ printf("no CUDA capable devices found, exiting...\n");
+ exit(EXIT_SUCCESS);
+ }
+ }
+ else
+ {
+ char name[100];
+ // Otherwise pick the device with highest Gflops/s
+ devID = gpuGetMaxGflopsGLDeviceIdDRV();
+ checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+ cuDeviceGetName(name, 100, cuDevice);
+ printf("> Using CUDA/GL Device [%d]: %s\n", devID, name);
+ }
+
+ return devID;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
+{
+ CUdevice cuDevice;
+ char name[256];
+ int major = 0, minor = 0;
+
+ checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+ checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
+ checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
+
+ if ((major > major_version) ||
+ (major == major_version && minor >= minor_version))
+ {
+ printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
+ return true;
+ }
+ else
+ {
+ printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
+ return false;
+ }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+#endif
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h
new file mode 100644
index 0000000..3d2d943
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h
@@ -0,0 +1,165 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#ifndef HELPER_CUDA_GL_H
+#define HELPER_CUDA_GL_H
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// includes, graphics
+#if defined (__APPLE__) || defined(MACOSX)
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+#include <GL/gl.h>
+#include <GL/glu.h>
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset()
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+
+#ifdef __CUDA_GL_INTEROP_H__
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA OpenGL Helper functions
+
+inline int gpuGLDeviceInit(int ARGC, const char **ARGV)
+{
+ int deviceCount;
+ checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+
+ if (deviceCount == 0)
+ {
+ fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ int dev = 0;
+ dev = getCmdLineArgumentInt(ARGC, ARGV, "device=");
+
+ if (dev < 0)
+ {
+ dev = 0;
+ }
+
+ if (dev > deviceCount-1)
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
+ fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
+ fprintf(stderr, "\n");
+ return -dev;
+ }
+
+ cudaDeviceProp deviceProp;
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+ if (deviceProp.computeMode == cudaComputeModeProhibited)
+ {
+ fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+ return -1;
+ }
+
+ if (deviceProp.major < 1)
+ {
+ fprintf(stderr, "Error: device does not support CUDA.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false)
+ {
+ fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
+ }
+
+ checkCudaErrors(cudaGLSetGLDevice(dev));
+ return dev;
+}
+
+// This function will pick the best CUDA device available with OpenGL interop
+inline int findCudaGLDevice(int argc, const char **argv)
+{
+ int devID = 0;
+
+ // If the command-line has a device number specified, use it
+ if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+ {
+ devID = gpuGLDeviceInit(argc, (const char **)argv);
+
+ if (devID < 0)
+ {
+ printf("no CUDA capable devices found, exiting...\n");
+ DEVICE_RESET
+ exit(EXIT_SUCCESS);
+ }
+ }
+ else
+ {
+ // Otherwise pick the device with highest Gflops/s
+ devID = gpuGetMaxGflopsDeviceId();
+ cudaGLSetGLDevice(devID);
+ }
+
+ return devID;
+}
+
+////////////////////////////////////////////////////////////////////////////
+//! Check for OpenGL error
+//! @return true if no GL error has been encountered, otherwise false
+//! @param file __FILE__ macro
+//! @param line __LINE__ macro
+//! @note The GL error is listed on stderr
+//! @note This function should be used via the CHECK_ERROR_GL() macro
+////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkCheckErrorGL(const char *file, const int line)
+{
+ bool ret_val = true;
+
+ // check for error
+ GLenum gl_error = glGetError();
+
+ if (gl_error != GL_NO_ERROR)
+ {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ char tmpStr[512];
+ // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
+ // when the user double clicks on the error line in the Output pane. Like any compile error.
+ sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error));
+ fprintf(stderr, "%s", tmpStr);
+#endif
+ fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
+ fprintf(stderr, "%s\n", gluErrorString(gl_error));
+ ret_val = false;
+ }
+
+ return ret_val;
+}
+
+#define SDK_CHECK_ERROR_GL() \
+ if( false == sdkCheckErrorGL( __FILE__, __LINE__)) { \
+ DEVICE_RESET \
+ exit(EXIT_FAILURE); \
+ }
+#endif
+
+#endif
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_functions.h b/src/algorithms/tracking/libs/cudahelpers/helper_functions.h
new file mode 100644
index 0000000..11538ba
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_functions.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, image helpers, etc)
+#ifndef HELPER_FUNCTIONS_H
+#define HELPER_FUNCTIONS_H
+
+#ifdef WIN32
+#pragma warning(disable:4996)
+#endif
+
+// includes, project
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+
+#include <fstream>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+// includes, timer, string parsing, image helpers
+#include <helper_timer.h> // helper functions for timers
+#include <helper_string.h> // helper functions for string parsing
+#include <helper_image.h> // helper functions for image compare, dump, data comparisons
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif // HELPER_FUNCTIONS_H
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_image.h b/src/algorithms/tracking/libs/cudahelpers/helper_image.h
new file mode 100644
index 0000000..4e8b25c
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_image.h
@@ -0,0 +1,1110 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef HELPER_IMAGE_H
+#define HELPER_IMAGE_H
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+
+#ifndef MIN
+#define MIN(a,b) ((a < b) ? a : b)
+#endif
+#ifndef MAX
+#define MAX(a,b) ((a > b) ? a : b)
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include <helper_string.h>
+
+// namespace unnamed (internal)
+namespace
+{
+ //! size of PGM file header
+ const unsigned int PGMHeaderSize = 0x40;
+
+ // types
+
+ //! Data converter from unsigned char / unsigned byte to type T
+ template<class T>
+ struct ConverterFromUByte;
+
+ //! Data converter from unsigned char / unsigned byte
+ template<>
+ struct ConverterFromUByte<unsigned char>
+ {
+ //! Conversion operator
+ //! @return converted value
+ //! @param val value to convert
+ float operator()(const unsigned char &val)
+ {
+ return static_cast<unsigned char>(val);
+ }
+ };
+
+ //! Data converter from unsigned char / unsigned byte to float
+ template<>
+ struct ConverterFromUByte<float>
+ {
+ //! Conversion operator
+ //! @return converted value
+ //! @param val value to convert
+ float operator()(const unsigned char &val)
+ {
+ return static_cast<float>(val) / 255.0f;
+ }
+ };
+
+ //! Data converter from unsigned char / unsigned byte to type T
+ template<class T>
+ struct ConverterToUByte;
+
+ //! Data converter from unsigned char / unsigned byte to unsigned int
+ template<>
+ struct ConverterToUByte<unsigned char>
+ {
+        //! Conversion operator (essentially a pass-through)
+ //! @return converted value
+ //! @param val value to convert
+ unsigned char operator()(const unsigned char &val)
+ {
+ return val;
+ }
+ };
+
+ //! Data converter from unsigned char / unsigned byte to unsigned int
+ template<>
+ struct ConverterToUByte<float>
+ {
+ //! Conversion operator
+ //! @return converted value
+ //! @param val value to convert
+ unsigned char operator()(const float &val)
+ {
+ return static_cast<unsigned char>(val * 255.0f);
+ }
+ };
+}
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif
+
+inline bool
+__loadPPM(const char *file, unsigned char **data,
+ unsigned int *w, unsigned int *h, unsigned int *channels)
+{
+ FILE *fp = NULL;
+
+ if (FOPEN_FAIL(FOPEN(fp, file, "rb")))
+ {
+ std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+ return false;
+ }
+
+ // check header
+ char header[PGMHeaderSize];
+
+ if (fgets(header, PGMHeaderSize, fp) == NULL)
+ {
+ std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+ return false;
+ }
+
+ if (strncmp(header, "P5", 2) == 0)
+ {
+ *channels = 1;
+ }
+ else if (strncmp(header, "P6", 2) == 0)
+ {
+ *channels = 3;
+ }
+ else
+ {
+ std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+ *channels = 0;
+ return false;
+ }
+
+ // parse header, read maxval, width and height
+ unsigned int width = 0;
+ unsigned int height = 0;
+ unsigned int maxval = 0;
+ unsigned int i = 0;
+
+ while (i < 3)
+ {
+ if (fgets(header, PGMHeaderSize, fp) == NULL)
+ {
+ std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+ return false;
+ }
+
+ if (header[0] == '#')
+ {
+ continue;
+ }
+
+ if (i == 0)
+ {
+ i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
+ }
+ else if (i == 1)
+ {
+ i += SSCANF(header, "%u %u", &height, &maxval);
+ }
+ else if (i == 2)
+ {
+ i += SSCANF(header, "%u", &maxval);
+ }
+ }
+
+ // check if given handle for the data is initialized
+ if (NULL != *data)
+ {
+ if (*w != width || *h != height)
+ {
+ std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+ }
+ }
+ else
+ {
+ *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels);
+ *w = width;
+ *h = height;
+ }
+
+ // read and close file
+ if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0)
+ {
+ std::cerr << "__LoadPPM() read data returned error." << std::endl;
+ }
+
+ fclose(fp);
+
+ return true;
+}
+
+template <class T>
+inline bool
+sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h)
+{
+ unsigned char *idata = NULL;
+ unsigned int channels;
+
+ if (true != __loadPPM(file, &idata, w, h, &channels))
+ {
+ return false;
+ }
+
+ unsigned int size = *w **h * channels;
+
+ // initialize mem if necessary
+    // the correct size is checked / set in __loadPPM()
+ if (NULL == *data)
+ {
+ *data = (T *) malloc(sizeof(T) * size);
+ }
+
+ // copy and cast data
+ std::transform(idata, idata + size, *data, ConverterFromUByte<T>());
+
+ free(idata);
+
+ return true;
+}
+
+template <class T>
+inline bool
+sdkLoadPPM4(const char *file, T **data,
+ unsigned int *w,unsigned int *h)
+{
+ unsigned char *idata = 0;
+ unsigned int channels;
+
+ if (__loadPPM(file, &idata, w, h, &channels))
+ {
+ // pad 4th component
+ int size = *w **h;
+ // keep the original pointer
+ unsigned char *idata_orig = idata;
+ *data = (T *) malloc(sizeof(T) * size * 4);
+ unsigned char *ptr = *data;
+
+ for (int i=0; i<size; i++)
+ {
+ *ptr++ = *idata++;
+ *ptr++ = *idata++;
+ *ptr++ = *idata++;
+ *ptr++ = 0;
+ }
+
+ free(idata_orig);
+ return true;
+ }
+ else
+ {
+ free(idata);
+ return false;
+ }
+}
+
+inline bool
+__savePPM(const char *file, unsigned char *data,
+ unsigned int w, unsigned int h, unsigned int channels)
+{
+ assert(NULL != data);
+ assert(w > 0);
+ assert(h > 0);
+
+ std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+ if (fh.bad())
+ {
+ std::cerr << "__savePPM() : Opening file failed." << std::endl;
+ return false;
+ }
+
+ if (channels == 1)
+ {
+ fh << "P5\n";
+ }
+ else if (channels == 3)
+ {
+ fh << "P6\n";
+ }
+ else
+ {
+ std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+ return false;
+ }
+
+ fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+ for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
+ {
+ fh << data[i];
+ }
+
+ fh.flush();
+
+ if (fh.bad())
+ {
+ std::cerr << "__savePPM() : Writing data failed." << std::endl;
+ return false;
+ }
+
+ fh.close();
+
+ return true;
+}
+
+template<class T>
+inline bool
+sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h)
+{
+ unsigned int size = w * h;
+ unsigned char *idata =
+ (unsigned char *) malloc(sizeof(unsigned char) * size);
+
+ std::transform(data, data + size, idata, ConverterToUByte<T>());
+
+ // write file
+ bool result = __savePPM(file, idata, w, h, 1);
+
+ // cleanup
+ free(idata);
+
+ return result;
+}
+
+inline bool
+sdkSavePPM4ub(const char *file, unsigned char *data,
+ unsigned int w, unsigned int h)
+{
+ // strip 4th component
+ int size = w * h;
+ unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3);
+ unsigned char *ptr = ndata;
+
+ for (int i=0; i<size; i++)
+ {
+ *ptr++ = *data++;
+ *ptr++ = *data++;
+ *ptr++ = *data++;
+ data++;
+ }
+
+ bool result = __savePPM(file, ndata, w, h, 3);
+ free(ndata);
+ return result;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return true if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data uninitialized pointer, returned initialized and pointing to
+//! the data read
+//! @param len number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template<class T>
+inline bool
+sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
+{
+ // check input arguments
+ assert(NULL != filename);
+ assert(NULL != len);
+
+ // intermediate storage for the data read
+ std::vector<T> data_read;
+
+ // open file for reading
+ FILE *fh = NULL;
+
+ // check if filestream is valid
+ if (FOPEN_FAIL(FOPEN(fh, filename, "r")))
+ {
+ printf("Unable to open input file: %s\n", filename);
+ return false;
+ }
+
+ // read all data elements
+ T token;
+
+ while (!feof(fh))
+ {
+ fscanf(fh, "%f", &token);
+ data_read.push_back(token);
+ }
+
+ // the last element is read twice
+ data_read.pop_back();
+ fclose(fh);
+
+ // check if the given handle is already initialized
+ if (NULL != *data)
+ {
+ if (*len != data_read.size())
+ {
+ std::cerr << "sdkReadFile() : Initialized memory given but "
+ << "size mismatch with signal read "
+ << "(data read / data init = " << (unsigned int)data_read.size()
+ << " / " << *len << ")" << std::endl;
+
+ return false;
+ }
+ }
+ else
+ {
+ // allocate storage for the data read
+ *data = (T *) malloc(sizeof(T) * data_read.size());
+ // store signal size
+ *len = static_cast<unsigned int>(data_read.size());
+ }
+
+ // copy data
+ memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return true if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data uninitialized pointer, returned initialized and pointing to
+//! the data read
+//! @param len number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template<class T>
+inline bool
+sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose)
+{
+ // check input arguments
+ assert(NULL != filename);
+ assert(NULL != len);
+
+ // open file for reading
+ FILE *fh = fopen(filename, "rb");
+
+ if (fh == NULL && verbose)
+ {
+ std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+ return false;
+ }
+
+ // check if the given handle is already initialized
+ // allocate storage for the data read
+ data[block_num] = (T *) malloc(block_size);
+
+ // read all data elements
+ fseek(fh, block_num * block_size, SEEK_SET);
+ *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh);
+
+ fclose(fh);
+
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename : epsilon on the first line ("# <epsilon>"),
+//! then the len elements of data separated by spaces, newline-terminated
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the destination file
+//! @param data data to write
+//! @param len number of data elements in data
+//! @param epsilon epsilon value written as the file header
+//! @param verbose emit progress/error messages on std::cerr
+//! @param append NOTE(review): currently ignored — the stream is always
+//!        opened with out|ate (see the commented-out branch below)
+//////////////////////////////////////////////////////////////////////////////
+template<class T, class S>
+inline bool
+sdkWriteFile(const char *filename, const T *data, unsigned int len,
+ const S epsilon, bool verbose, bool append = false)
+{
+    assert(NULL != filename);
+    assert(NULL != data);
+
+ // open file for writing
+ // NOTE(review): out|ate seeks to the end once at open; confirm whether
+ // true append semantics (out|app) were intended here.
+ // if (append) {
+ std::fstream fh(filename, std::fstream::out | std::fstream::ate);
+
+ if (verbose)
+ {
+ std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl;
+ }
+
+ /* } else {
+ std::fstream fh(filename, std::fstream::out);
+ if (verbose) {
+ std::cerr << "sdkWriteFile() : Open file " << filename << " for write." << std::endl;
+ }
+ }
+ */
+
+ // check if filestream is valid
+ if (! fh.good())
+ {
+ if (verbose)
+ {
+ std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+ }
+
+ return false;
+ }
+
+ // first write epsilon
+ fh << "# " << epsilon << "\n";
+
+ // write data
+ for (unsigned int i = 0; (i < len) && (fh.good()); ++i)
+ {
+ fh << data[i] << ' ';
+ }
+
+ // Check if writing succeeded
+ if (! fh.good())
+ {
+ if (verbose)
+ {
+ std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+ }
+
+ return false;
+ }
+
+ // file ends with nl
+ fh << std::endl;
+
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return true if \a reference and \a data are identical, otherwise false
+//! @param reference handle to the reference data / gold image
+//! @param data handle to the computed data
+//! @param len number of elements in reference and data
+//! @param epsilon epsilon to use for the comparison (absolute difference)
+//! @param threshold 0.0f requires every element to match; otherwise the
+//!        compare passes while the mismatch count stays below len*threshold
+//////////////////////////////////////////////////////////////////////////////
+template<class T, class S>
+inline bool
+compareData(const T *reference, const T *data, const unsigned int len,
+ const S epsilon, const float threshold)
+{
+ assert(epsilon >= 0);
+
+ bool result = true;
+ unsigned int error_count = 0;
+
+ for (unsigned int i = 0; i < len; ++i)
+ {
+ // element-wise absolute-difference test (values compared as float)
+ float diff = (float)reference[i] - (float)data[i];
+ bool comp = (diff <= epsilon) && (diff >= -epsilon);
+ result &= comp;
+
+ error_count += !comp;
+
+#if 0
+
+ if (! comp)
+ {
+ std::cerr << "ERROR, i = " << i << ",\t "
+ << reference[i] << " / "
+ << data[i]
+ << " (reference / data)\n";
+ }
+
+#endif
+ }
+
+ if (threshold == 0.0f)
+ {
+ return (result) ? true : false;
+ }
+ else
+ {
+ if (error_count)
+ {
+ printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
+ }
+
+ return (len*threshold > error_count) ? true : false;
+ }
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type, element-wise as floats
+//! @return true if \a reference and \a data are identical, otherwise false
+//! @param reference handle to the reference data / gold image
+//! @param data handle to the computed data
+//! @param len number of elements in reference and data
+//! @param epsilon epsilon to use for the comparison (floored at
+//!        __MIN_EPSILON_ERROR, so an epsilon of 0 still tolerates 1e-3)
+//! @param threshold fraction of elements allowed to mismatch for pass/fail;
+//!        0.0f means no mismatch is tolerated
+//////////////////////////////////////////////////////////////////////////////
+template<class T, class S>
+inline bool
+compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len,
+ const S epsilon, const float threshold)
+{
+ assert(epsilon >= 0);
+
+ // If we set epsilon to be 0, let's set a minimum threshold
+ float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+ int error_count = 0;
+ // NOTE(review): result is accumulated below but never read after the
+ // loop — both return paths rely solely on error_count.
+ bool result = true;
+
+ for (unsigned int i = 0; i < len; ++i)
+ {
+ float diff = fabs((float)reference[i] - (float)data[i]);
+ bool comp = (diff < max_error);
+ result &= comp;
+
+ if (! comp)
+ {
+ error_count++;
+#if 0
+
+ if (error_count < 50)
+ {
+ printf("\n ERROR(epsilon=%4.3f), i=%d, (ref)0x%02x / (data)0x%02x / (diff)%d\n",
+ max_error, i,
+ *(unsigned int *)&reference[i],
+ *(unsigned int *)&data[i],
+ (unsigned int)diff);
+ }
+
+#endif
+ }
+ }
+
+ if (threshold == 0.0f)
+ {
+ if (error_count)
+ {
+ printf("total # of errors = %d\n", error_count);
+ }
+
+ return (error_count == 0) ? true : false;
+ }
+ else
+ {
+ if (error_count)
+ {
+ printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
+ }
+
+ return ((len*threshold > error_count) ? true : false);
+ }
+}
+
+// Dump <bytes> bytes starting at <data> to the binary file <filename>.
+inline
+void sdkDumpBin(void *data, unsigned int bytes, const char *filename)
+{
+ printf("sdkDumpBin: <%s>\n", filename);
+ FILE *fp;
+ // NOTE(review): FOPEN result is not checked — fwrite/fflush/fclose would
+ // operate on a NULL handle if the file cannot be created.
+ FOPEN(fp, filename, "wb");
+ fwrite(data, bytes, 1, fp);
+ fflush(fp);
+ fclose(fp);
+}
+
+// Compare the first nelements unsigned ints of binary files src_file and
+// ref_file (ref_file is located via sdkFindFilePath relative to exec_path)
+// using compareData with the given epsilon/threshold.
+// Returns true when both files opened and all elements matched.
+inline
+bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
+{
+ unsigned int *src_buffer, *ref_buffer;
+ FILE *src_fp = NULL, *ref_fp = NULL;
+
+ unsigned long error_count = 0;
+ size_t fsize = 0;
+
+ if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
+ {
+ printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n", src_file);
+ error_count++;
+ }
+
+ // NOTE(review): the string returned by sdkFindFilePath is never freed.
+ char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+ if (ref_file_path == NULL)
+ {
+ printf("compareBin2Bin <unsigned int> unable to find <%s> in <%s>\n", ref_file, exec_path);
+ printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
+ printf("Aborting comparison!\n");
+ printf(" FAILED\n");
+ error_count++;
+
+ if (src_fp)
+ {
+ fclose(src_fp);
+ }
+
+ if (ref_fp)
+ {
+ fclose(ref_fp);
+ }
+ }
+ else
+ {
+ if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
+ {
+ printf("compareBin2Bin <unsigned int> unable to open ref_file: %s\n", ref_file_path);
+ error_count++;
+ }
+
+ if (src_fp && ref_fp)
+ {
+ // NOTE(review): malloc results are unchecked; fread's size/count
+ // arguments are swapped versus the usual fread(ptr, sizeof(T),
+ // count, fp) convention — the same byte count is requested, but
+ // fsize then holds an item count, not the byte size the printf
+ // lines below imply.
+ src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
+ ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
+
+ fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
+ fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
+
+ printf("> compareBin2Bin <unsigned int> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
+ printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
+ printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
+
+ if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
+ {
+ error_count++;
+ }
+
+ fclose(src_fp);
+ fclose(ref_fp);
+
+ free(src_buffer);
+ free(ref_buffer);
+ }
+ else
+ {
+ if (src_fp)
+ {
+ fclose(src_fp);
+ }
+
+ if (ref_fp)
+ {
+ fclose(ref_fp);
+ }
+ }
+ }
+
+ if (error_count == 0)
+ {
+ printf(" OK\n");
+ }
+ else
+ {
+ printf(" FAILURE: %d errors...\n", (unsigned int)error_count);
+ }
+
+ return (error_count == 0); // returns true if all pixels pass
+}
+
+// Compare the first nelements floats of binary files src_file and ref_file
+// (ref_file located via sdkFindFilePath relative to exec_path) using
+// compareDataAsFloatThreshold with the given epsilon/threshold.
+// Returns true when both files opened and all elements matched.
+inline
+bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
+{
+ float *src_buffer, *ref_buffer;
+ FILE *src_fp = NULL, *ref_fp = NULL;
+ size_t fsize = 0;
+
+ unsigned long error_count = 0;
+
+ if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
+ {
+ printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+ error_count = 1;
+ }
+
+ // NOTE(review): the string returned by sdkFindFilePath is never freed.
+ char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+ if (ref_file_path == NULL)
+ {
+ printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file, exec_path);
+ printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", exec_path);
+ printf("Aborting comparison!\n");
+ printf(" FAILED\n");
+ error_count++;
+
+ if (src_fp)
+ {
+ fclose(src_fp);
+ }
+
+ if (ref_fp)
+ {
+ fclose(ref_fp);
+ }
+ }
+ else
+ {
+ if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
+ {
+ printf("compareBin2Bin <float> unable to open ref_file: %s\n", ref_file_path);
+ error_count = 1;
+ }
+
+ if (src_fp && ref_fp)
+ {
+ // NOTE(review): malloc results are unchecked; fread's size/count
+ // arguments are swapped versus convention, so fsize holds an item
+ // count rather than the byte size the printf lines below imply.
+ src_buffer = (float *)malloc(nelements*sizeof(float));
+ ref_buffer = (float *)malloc(nelements*sizeof(float));
+
+ fsize = fread(src_buffer, nelements, sizeof(float), src_fp);
+ fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp);
+
+ printf("> compareBin2Bin <float> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
+ printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
+ printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
+
+ if (!compareDataAsFloatThreshold<float, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
+ {
+ error_count++;
+ }
+
+ fclose(src_fp);
+ fclose(ref_fp);
+
+ free(src_buffer);
+ free(ref_buffer);
+ }
+ else
+ {
+ if (src_fp)
+ {
+ fclose(src_fp);
+ }
+
+ if (ref_fp)
+ {
+ fclose(ref_fp);
+ }
+ }
+ }
+
+ if (error_count == 0)
+ {
+ printf(" OK\n");
+ }
+ else
+ {
+ printf(" FAILURE: %d errors...\n", (unsigned int)error_count);
+ }
+
+ return (error_count == 0); // returns true if all pixels pass
+}
+
+// Compare two float arrays via relative L2 norm:
+// passes when ||reference - data||_2 / ||reference||_2 < epsilon.
+inline bool
+sdkCompareL2fe(const float *reference, const float *data,
+ const unsigned int len, const float epsilon)
+{
+ assert(epsilon >= 0);
+
+ float error = 0;
+ float ref = 0;
+
+ // accumulate squared error and squared reference magnitude
+ for (unsigned int i = 0; i < len; ++i)
+ {
+
+ float diff = reference[i] - data[i];
+ error += diff * diff;
+ ref += reference[i] * reference[i];
+ }
+
+ float normRef = sqrtf(ref);
+
+ // guard against a zero reference norm (the division below would blow up);
+ // note the test is on the sum of squares, not on normRef itself
+ if (fabs(ref) < 1e-7)
+ {
+#ifdef _DEBUG
+ std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+ return false;
+ }
+
+ float normError = sqrtf(error);
+ error = normError / normRef;
+ bool result = error < epsilon;
+#ifdef _DEBUG
+
+ if (! result)
+ {
+ std::cerr << "ERROR, l2-norm error "
+ << error << " is greater than epsilon " << epsilon << "\n";
+ }
+
+#endif
+
+ return result;
+}
+
+// Load a PGM/PPM file as unsigned char data; thin wrapper over __loadPPM
+// that discards the channel count.
+inline bool
+sdkLoadPPMub(const char *file, unsigned char **data,
+ unsigned int *w,unsigned int *h)
+{
+ unsigned int channels;
+ return __loadPPM(file, data, w, h, &channels);
+}
+
+// Load a PPM file and expand it to 4 channels (RGBA) with the 4th byte
+// zeroed; *data receives a freshly malloc'd buffer of *w * *h * 4 bytes.
+// NOTE(review): the channel count reported by __loadPPM is never checked —
+// the copy loop below assumes 3 channels per pixel.
+inline bool
+sdkLoadPPM4ub(const char *file, unsigned char **data,
+ unsigned int *w, unsigned int *h)
+{
+ unsigned char *idata = 0;
+ unsigned int channels;
+
+ if (__loadPPM(file, &idata, w, h, &channels))
+ {
+ // pad 4th component
+ int size = *w **h;
+ // keep the original pointer
+ unsigned char *idata_orig = idata;
+ *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4);
+ unsigned char *ptr = *data;
+
+ // copy RGB per pixel, zero the 4th byte
+ for (int i=0; i<size; i++)
+ {
+ *ptr++ = *idata++;
+ *ptr++ = *idata++;
+ *ptr++ = *idata++;
+ *ptr++ = 0;
+ }
+
+ free(idata_orig);
+ return true;
+ }
+ else
+ {
+ free(idata);
+ return false;
+ }
+}
+
+
+// Compare two PPM image files (each loaded as RGBA via sdkLoadPPM4ub) with
+// compareData using the given epsilon and mismatch threshold.
+inline bool
+sdkComparePPM(const char *src_file, const char *ref_file,
+ const float epsilon, const float threshold, bool verboseErrors)
+{
+ unsigned char *src_data, *ref_data;
+ unsigned long error_count = 0;
+ unsigned int ref_width, ref_height;
+ unsigned int src_width, src_height;
+
+ if (src_file == NULL || ref_file == NULL)
+ {
+ if (verboseErrors)
+ {
+ std::cerr << "PPMvsPPM: src_file or ref_file is NULL. Aborting comparison\n";
+ }
+
+ return false;
+ }
+
+ if (verboseErrors)
+ {
+ std::cerr << "> Compare (a)rendered: <" << src_file << ">\n";
+ std::cerr << "> (b)reference: <" << ref_file << ">\n";
+ }
+
+
+ if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true)
+ {
+ if (verboseErrors)
+ {
+ std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n";
+ }
+
+ return false;
+ }
+
+ if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true)
+ {
+ std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n";
+ return false;
+ }
+
+ // NOTE(review): a size mismatch is only reported — the comparison below
+ // still runs over the src dimensions. The buffers returned by
+ // sdkLoadPPM4ub are never freed on any path of this function.
+ if (src_height != ref_height || src_width != ref_width)
+ {
+ if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width <<
+ "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
+ }
+
+ if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width <<
+ "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
+
+ if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false)
+ {
+ error_count=1;
+ }
+
+ if (error_count == 0)
+ {
+ if (verboseErrors)
+ {
+ std::cerr << " OK\n\n";
+ }
+ }
+ else
+ {
+ if (verboseErrors)
+ {
+ std::cerr << " FAILURE! "<<error_count<<" errors...\n\n";
+ }
+ }
+
+ return (error_count == 0)? true : false; // returns true if all pixels pass
+}
+
+// Compare two PGM image files (each loaded single-channel via sdkLoadPPMub)
+// with compareData using the given epsilon and mismatch threshold.
+inline bool
+sdkComparePGM(const char *src_file, const char *ref_file,
+ const float epsilon, const float threshold, bool verboseErrors)
+{
+ unsigned char *src_data = 0, *ref_data = 0;
+ unsigned long error_count = 0;
+ unsigned int ref_width, ref_height;
+ unsigned int src_width, src_height;
+
+ if (src_file == NULL || ref_file == NULL)
+ {
+ if (verboseErrors)
+ {
+ std::cerr << "PGMvsPGM: src_file or ref_file is NULL. Aborting comparison\n";
+ }
+
+ return false;
+ }
+
+ if (verboseErrors)
+ {
+ std::cerr << "> Compare (a)rendered: <" << src_file << ">\n";
+ std::cerr << "> (b)reference: <" << ref_file << ">\n";
+ }
+
+
+ if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true)
+ {
+ if (verboseErrors)
+ {
+ std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n";
+ }
+
+ return false;
+ }
+
+ if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true)
+ {
+ std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n";
+ return false;
+ }
+
+ // NOTE(review): a size mismatch is only reported — the comparison below
+ // still runs over the src dimensions. The buffers returned by
+ // sdkLoadPPMub are never freed on any path of this function.
+ if (src_height != ref_height || src_width != ref_width)
+ {
+ if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width <<
+ "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
+ }
+
+ if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width <<
+ "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
+
+ if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false)
+ {
+ error_count=1;
+ }
+
+ if (error_count == 0)
+ {
+ if (verboseErrors)
+ {
+ std::cerr << " OK\n\n";
+ }
+ }
+ else
+ {
+ if (verboseErrors)
+ {
+ std::cerr << " FAILURE! "<<error_count<<" errors...\n\n";
+ }
+ }
+
+ return (error_count == 0)? true : false; // returns true if all pixels pass
+}
+
+#endif // HELPER_IMAGE_H
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_math.h b/src/algorithms/tracking/libs/cudahelpers/helper_math.h
new file mode 100644
index 0000000..c9c07c3
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_math.h
@@ -0,0 +1,1453 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+ * This file implements common mathematical operations on vector types
+ * (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ * The syntax is modeled on the Cg standard library.
+ *
+ * This is part of the Helper library includes
+ *
+ * Thanks to Linh Hah for additions and fixes.
+ */
+
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+
+#include "cuda_runtime.h"
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifndef __CUDACC__
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+// host fallback for CUDA's fminf (this section compiles only when nvcc is
+// not the compiler — see the surrounding #ifndef __CUDACC__ guard)
+inline float fminf(float a, float b)
+{
+ return a < b ? a : b;
+}
+
+// host fallback for CUDA's fmaxf
+inline float fmaxf(float a, float b)
+{
+ return a > b ? a : b;
+}
+
+// host fallback for CUDA's integer max
+inline int max(int a, int b)
+{
+ return a > b ? a : b;
+}
+
+// host fallback for CUDA's integer min
+inline int min(int a, int b)
+{
+ return a < b ? a : b;
+}
+
+// host fallback for CUDA's reciprocal square root
+inline float rsqrtf(float x)
+{
+ return 1.0f / sqrtf(x);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+
+// Constructors / conversions between CUDA vector types. Scalar overloads
+// broadcast the scalar to every component; narrower targets drop trailing
+// components; wider targets zero-fill the extra component (unless supplied
+// explicitly); cross-type forms cast each component explicitly.
+inline __host__ __device__ float2 make_float2(float s)
+{
+ return make_float2(s, s);
+}
+inline __host__ __device__ float2 make_float2(float3 a)
+{
+ return make_float2(a.x, a.y);
+}
+inline __host__ __device__ float2 make_float2(int2 a)
+{
+ return make_float2(float(a.x), float(a.y));
+}
+inline __host__ __device__ float2 make_float2(uint2 a)
+{
+ return make_float2(float(a.x), float(a.y));
+}
+
+inline __host__ __device__ int2 make_int2(int s)
+{
+ return make_int2(s, s);
+}
+inline __host__ __device__ int2 make_int2(int3 a)
+{
+ return make_int2(a.x, a.y);
+}
+inline __host__ __device__ int2 make_int2(uint2 a)
+{
+ return make_int2(int(a.x), int(a.y));
+}
+inline __host__ __device__ int2 make_int2(float2 a)
+{
+ return make_int2(int(a.x), int(a.y));
+}
+
+inline __host__ __device__ uint2 make_uint2(uint s)
+{
+ return make_uint2(s, s);
+}
+inline __host__ __device__ uint2 make_uint2(uint3 a)
+{
+ return make_uint2(a.x, a.y);
+}
+inline __host__ __device__ uint2 make_uint2(int2 a)
+{
+ return make_uint2(uint(a.x), uint(a.y));
+}
+
+inline __host__ __device__ float3 make_float3(float s)
+{
+ return make_float3(s, s, s);
+}
+inline __host__ __device__ float3 make_float3(float2 a)
+{
+ return make_float3(a.x, a.y, 0.0f);
+}
+inline __host__ __device__ float3 make_float3(float2 a, float s)
+{
+ return make_float3(a.x, a.y, s);
+}
+inline __host__ __device__ float3 make_float3(float4 a)
+{
+ return make_float3(a.x, a.y, a.z);
+}
+inline __host__ __device__ float3 make_float3(int3 a)
+{
+ return make_float3(float(a.x), float(a.y), float(a.z));
+}
+inline __host__ __device__ float3 make_float3(uint3 a)
+{
+ return make_float3(float(a.x), float(a.y), float(a.z));
+}
+
+inline __host__ __device__ int3 make_int3(int s)
+{
+ return make_int3(s, s, s);
+}
+inline __host__ __device__ int3 make_int3(int2 a)
+{
+ return make_int3(a.x, a.y, 0);
+}
+inline __host__ __device__ int3 make_int3(int2 a, int s)
+{
+ return make_int3(a.x, a.y, s);
+}
+inline __host__ __device__ int3 make_int3(uint3 a)
+{
+ return make_int3(int(a.x), int(a.y), int(a.z));
+}
+inline __host__ __device__ int3 make_int3(float3 a)
+{
+ return make_int3(int(a.x), int(a.y), int(a.z));
+}
+
+inline __host__ __device__ uint3 make_uint3(uint s)
+{
+ return make_uint3(s, s, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a)
+{
+ return make_uint3(a.x, a.y, 0);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
+{
+ return make_uint3(a.x, a.y, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint4 a)
+{
+ return make_uint3(a.x, a.y, a.z);
+}
+inline __host__ __device__ uint3 make_uint3(int3 a)
+{
+ return make_uint3(uint(a.x), uint(a.y), uint(a.z));
+}
+
+inline __host__ __device__ float4 make_float4(float s)
+{
+ return make_float4(s, s, s, s);
+}
+inline __host__ __device__ float4 make_float4(float3 a)
+{
+ return make_float4(a.x, a.y, a.z, 0.0f);
+}
+inline __host__ __device__ float4 make_float4(float3 a, float w)
+{
+ return make_float4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ float4 make_float4(int4 a)
+{
+ return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+inline __host__ __device__ float4 make_float4(uint4 a)
+{
+ return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+
+inline __host__ __device__ int4 make_int4(int s)
+{
+ return make_int4(s, s, s, s);
+}
+inline __host__ __device__ int4 make_int4(int3 a)
+{
+ return make_int4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ int4 make_int4(int3 a, int w)
+{
+ return make_int4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ int4 make_int4(uint4 a)
+{
+ return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+inline __host__ __device__ int4 make_int4(float4 a)
+{
+ return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+
+
+inline __host__ __device__ uint4 make_uint4(uint s)
+{
+ return make_uint4(s, s, s, s);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a)
+{
+ return make_uint4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
+{
+ return make_uint4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ uint4 make_uint4(int4 a)
+{
+ return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// negate
+////////////////////////////////////////////////////////////////////////////////
+
+// Componentwise unary negation.
+// NOTE(review): these take non-const references, so they cannot be applied
+// to const values or temporaries; no overloads exist for the uint types.
+inline __host__ __device__ float2 operator-(float2 &a)
+{
+ return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(int2 &a)
+{
+ return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(float3 &a)
+{
+ return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(int3 &a)
+{
+ return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(float4 &a)
+{
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(int4 &a)
+{
+ return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+
+// Componentwise vector+vector addition, broadcast vector+scalar and
+// scalar+vector forms, and the matching compound assignments. Every
+// operator+= mutates its first argument in place and returns void.
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+ return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+}
+inline __host__ __device__ float2 operator+(float2 a, float b)
+{
+ return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ float2 operator+(float b, float2 a)
+{
+ return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(float2 &a, float b)
+{
+ a.x += b;
+ a.y += b;
+}
+
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+ return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)
+{
+ return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)
+{
+ return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)
+{
+ a.x += b;
+ a.y += b;
+}
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{
+ return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
+{
+ return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
+{
+ return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)
+{
+ a.x += b;
+ a.y += b;
+}
+
+
+inline __host__ __device__ float3 operator+(float3 a, float3 b)
+{
+ return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+ a.z += b.z;
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)
+{
+ return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float b)
+{
+ a.x += b;
+ a.y += b;
+ a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int3 a, int3 b)
+{
+ return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+ a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)
+{
+ return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)
+{
+ a.x += b;
+ a.y += b;
+ a.z += b;
+}
+
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
+{
+ return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+ a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
+{
+ return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)
+{
+ a.x += b;
+ a.y += b;
+ a.z += b;
+}
+
+// scalar+vector forms for the 3-component types
+inline __host__ __device__ int3 operator+(int b, int3 a)
+{
+ return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
+{
+ return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ float3 operator+(float b, float3 a)
+{
+ return make_float3(a.x + b, a.y + b, a.z + b);
+}
+
+inline __host__ __device__ float4 operator+(float4 a, float4 b)
+{
+ return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __host__ __device__ void operator+=(float4 &a, float4 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+ a.z += b.z;
+ a.w += b.w;
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)
+{
+ return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float b, float4 a)
+{
+ return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(float4 &a, float b)
+{
+ a.x += b;
+ a.y += b;
+ a.z += b;
+ a.w += b;
+}
+
+inline __host__ __device__ int4 operator+(int4 a, int4 b)
+{
+ return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+ a.z += b.z;
+ a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)
+{
+ return make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)
+{
+ return make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)
+{
+ a.x += b;
+ a.y += b;
+ a.z += b;
+ a.w += b;
+}
+
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
+{
+ return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
+{
+ a.x += b.x;
+ a.y += b.y;
+ a.z += b.z;
+ a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
+{
+ return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
+{
+ return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)
+{
+ a.x += b;
+ a.y += b;
+ a.z += b;
+ a.w += b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+
+// Componentwise vector-vector subtraction, broadcast vector-scalar and
+// scalar-vector forms, and the matching compound assignments. The
+// scalar-vector forms compute (b - a.component), i.e. they are NOT
+// commutative with the vector-scalar forms.
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+ return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+}
+inline __host__ __device__ float2 operator-(float2 a, float b)
+{
+ return make_float2(a.x - b, a.y - b);
+}
+inline __host__ __device__ float2 operator-(float b, float2 a)
+{
+ return make_float2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float b)
+{
+ a.x -= b;
+ a.y -= b;
+}
+
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+ return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)
+{
+ return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)
+{
+ return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)
+{
+ a.x -= b;
+ a.y -= b;
+}
+
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
+{
+ return make_uint2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+}
+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
+{
+ return make_uint2(a.x - b, a.y - b);
+}
+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
+{
+ return make_uint2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint b)
+{
+ a.x -= b;
+ a.y -= b;
+}
+
+inline __host__ __device__ float3 operator-(float3 a, float3 b)
+{
+ return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float3 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+ a.z -= b.z;
+}
+inline __host__ __device__ float3 operator-(float3 a, float b)
+{
+ return make_float3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ float3 operator-(float b, float3 a)
+{
+ return make_float3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float b)
+{
+ a.x -= b;
+ a.y -= b;
+ a.z -= b;
+}
+
+inline __host__ __device__ int3 operator-(int3 a, int3 b)
+{
+ return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int3 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+ a.z -= b.z;
+}
+inline __host__ __device__ int3 operator-(int3 a, int b)
+{
+ return make_int3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ int3 operator-(int b, int3 a)
+{
+ return make_int3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int b)
+{
+ a.x -= b;
+ a.y -= b;
+ a.z -= b;
+}
+
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
+{
+ return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+ a.z -= b.z;
+}
+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
+{
+ return make_uint3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
+{
+ return make_uint3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint b)
+{
+ a.x -= b;
+ a.y -= b;
+ a.z -= b;
+}
+
+// NOTE(review): unlike the 2- and 3-component versions (and int4/uint4
+// below), there is no operator-(float b, float4 a) overload in this group.
+inline __host__ __device__ float4 operator-(float4 a, float4 b)
+{
+ return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ void operator-=(float4 &a, float4 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+ a.z -= b.z;
+ a.w -= b.w;
+}
+inline __host__ __device__ float4 operator-(float4 a, float b)
+{
+ return make_float4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+inline __host__ __device__ void operator-=(float4 &a, float b)
+{
+ a.x -= b;
+ a.y -= b;
+ a.z -= b;
+ a.w -= b;
+}
+
+inline __host__ __device__ int4 operator-(int4 a, int4 b)
+{
+ return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int4 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+ a.z -= b.z;
+ a.w -= b.w;
+}
+inline __host__ __device__ int4 operator-(int4 a, int b)
+{
+ return make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+inline __host__ __device__ int4 operator-(int b, int4 a)
+{
+ return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int b)
+{
+ a.x -= b;
+ a.y -= b;
+ a.z -= b;
+ a.w -= b;
+}
+
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
+{
+ return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
+{
+ a.x -= b.x;
+ a.y -= b.y;
+ a.z -= b.z;
+ a.w -= b.w;
+}
+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
+{
+ return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
+{
+ return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint b)
+{
+ a.x -= b;
+ a.y -= b;
+ a.z -= b;
+ a.w -= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+
+// Componentwise (Hadamard) vector*vector multiplication, broadcast
+// vector*scalar and scalar*vector forms, and the matching compound
+// assignments. Every operator*= mutates its first argument in place.
+inline __host__ __device__ float2 operator*(float2 a, float2 b)
+{
+ return make_float2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float2 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+}
+inline __host__ __device__ float2 operator*(float2 a, float b)
+{
+ return make_float2(a.x * b, a.y * b);
+}
+inline __host__ __device__ float2 operator*(float b, float2 a)
+{
+ return make_float2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float b)
+{
+ a.x *= b;
+ a.y *= b;
+}
+
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+ return make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int2 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+}
+inline __host__ __device__ int2 operator*(int2 a, int b)
+{
+ return make_int2(a.x * b, a.y * b);
+}
+inline __host__ __device__ int2 operator*(int b, int2 a)
+{
+ return make_int2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int b)
+{
+ a.x *= b;
+ a.y *= b;
+}
+
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
+{
+ return make_uint2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+}
+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
+{
+ return make_uint2(a.x * b, a.y * b);
+}
+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
+{
+ return make_uint2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint b)
+{
+ a.x *= b;
+ a.y *= b;
+}
+
+inline __host__ __device__ float3 operator*(float3 a, float3 b)
+{
+ return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float3 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+ a.z *= b.z;
+}
+inline __host__ __device__ float3 operator*(float3 a, float b)
+{
+ return make_float3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ float3 operator*(float b, float3 a)
+{
+ return make_float3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float b)
+{
+ a.x *= b;
+ a.y *= b;
+ a.z *= b;
+}
+
+inline __host__ __device__ int3 operator*(int3 a, int3 b)
+{
+ return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int3 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+ a.z *= b.z;
+}
+inline __host__ __device__ int3 operator*(int3 a, int b)
+{
+ return make_int3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ int3 operator*(int b, int3 a)
+{
+ return make_int3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int b)
+{
+ a.x *= b;
+ a.y *= b;
+ a.z *= b;
+}
+
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
+{
+ return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+ a.z *= b.z;
+}
+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
+{
+ return make_uint3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
+{
+ return make_uint3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint b)
+{
+ a.x *= b;
+ a.y *= b;
+ a.z *= b;
+}
+
+inline __host__ __device__ float4 operator*(float4 a, float4 b)
+{
+ return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float4 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+ a.z *= b.z;
+ a.w *= b.w;
+}
+inline __host__ __device__ float4 operator*(float4 a, float b)
+{
+ return make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+inline __host__ __device__ float4 operator*(float b, float4 a)
+{
+ return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float b)
+{
+ a.x *= b;
+ a.y *= b;
+ a.z *= b;
+ a.w *= b;
+}
+
+inline __host__ __device__ int4 operator*(int4 a, int4 b)
+{
+ return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int4 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+ a.z *= b.z;
+ a.w *= b.w;
+}
+inline __host__ __device__ int4 operator*(int4 a, int b)
+{
+ return make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+inline __host__ __device__ int4 operator*(int b, int4 a)
+{
+ return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int b)
+{
+ a.x *= b;
+ a.y *= b;
+ a.z *= b;
+ a.w *= b;
+}
+
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
+{
+ return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
+{
+ a.x *= b.x;
+ a.y *= b.y;
+ a.z *= b.z;
+ a.w *= b.w;
+}
+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
+{
+ return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
+{
+ return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint b)
+{
+ a.x *= b;
+ a.y *= b;
+ a.z *= b;
+ a.w *= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// divide
+// Component-wise division; float vectors only (no int/uint overloads provided).
+// No guard against division by zero: a zero component yields inf/nan per IEEE-754.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator/(float2 a, float2 b)
+{
+ return make_float2(a.x / b.x, a.y / b.y);
+}
+inline __host__ __device__ void operator/=(float2 &a, float2 b)
+{
+ a.x /= b.x;
+ a.y /= b.y;
+}
+inline __host__ __device__ float2 operator/(float2 a, float b)
+{
+ return make_float2(a.x / b, a.y / b);
+}
+inline __host__ __device__ void operator/=(float2 &a, float b)
+{
+ a.x /= b;
+ a.y /= b;
+}
+inline __host__ __device__ float2 operator/(float b, float2 a)
+{
+ return make_float2(b / a.x, b / a.y);
+}
+
+inline __host__ __device__ float3 operator/(float3 a, float3 b)
+{
+ return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+inline __host__ __device__ void operator/=(float3 &a, float3 b)
+{
+ a.x /= b.x;
+ a.y /= b.y;
+ a.z /= b.z;
+}
+inline __host__ __device__ float3 operator/(float3 a, float b)
+{
+ return make_float3(a.x / b, a.y / b, a.z / b);
+}
+inline __host__ __device__ void operator/=(float3 &a, float b)
+{
+ a.x /= b;
+ a.y /= b;
+ a.z /= b;
+}
+inline __host__ __device__ float3 operator/(float b, float3 a)
+{
+ return make_float3(b / a.x, b / a.y, b / a.z);
+}
+
+inline __host__ __device__ float4 operator/(float4 a, float4 b)
+{
+ return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+inline __host__ __device__ void operator/=(float4 &a, float4 b)
+{
+ a.x /= b.x;
+ a.y /= b.y;
+ a.z /= b.z;
+ a.w /= b.w;
+}
+inline __host__ __device__ float4 operator/(float4 a, float b)
+{
+ return make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
+}
+inline __host__ __device__ void operator/=(float4 &a, float b)
+{
+ a.x /= b;
+ a.y /= b;
+ a.z /= b;
+ a.w /= b;
+}
+inline __host__ __device__ float4 operator/(float b, float4 a)
+{
+ return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// min
+// Per-component minimum. fminf is used for floats, min for int/uint;
+// these overloads shadow/extend the scalar functions of the same name.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fminf(float2 a, float2 b)
+{
+ return make_float2(fminf(a.x,b.x), fminf(a.y,b.y));
+}
+inline __host__ __device__ float3 fminf(float3 a, float3 b)
+{
+ return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
+}
+inline __host__ __device__ float4 fminf(float4 a, float4 b)
+{
+ return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 min(int2 a, int2 b)
+{
+ return make_int2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ int3 min(int3 a, int3 b)
+{
+ return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ int4 min(int4 a, int4 b)
+{
+ return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 min(uint2 a, uint2 b)
+{
+ return make_uint2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ uint3 min(uint3 a, uint3 b)
+{
+ return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ uint4 min(uint4 a, uint4 b)
+{
+ return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// max
+// Per-component maximum; mirror image of the min section above.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
+{
+ return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y));
+}
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
+{
+ return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
+}
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
+{
+ return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 max(int2 a, int2 b)
+{
+ return make_int2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ int3 max(int3 a, int3 b)
+{
+ return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ int4 max(int4 a, int4 b)
+{
+ return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
+{
+ return make_uint2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
+{
+ return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
+{
+ return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+// - computed as a + t*(b-a); t outside [0, 1] extrapolates rather than clamps
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float lerp(float a, float b, float t)
+{
+ return a + t*(b-a);
+}
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
+{
+ return a + t*(b-a);
+}
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
+{
+ return a + t*(b-a);
+}
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
+{
+ return a + t*(b-a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+// - scalar forms use max(a, min(v, b)); vector forms exist with either scalar
+//   bounds (same [a, b] for every component) or per-component vector bounds
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+ return fmaxf(a, fminf(f, b));
+}
+inline __device__ __host__ int clamp(int f, int a, int b)
+{
+ return max(a, min(f, b));
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
+{
+ return max(a, min(f, b));
+}
+
+inline __device__ __host__ float2 clamp(float2 v, float a, float b)
+{
+ return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
+{
+ return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+ return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+ return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ float4 clamp(float4 v, float a, float b)
+{
+ return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
+{
+ return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
+{
+ return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
+{
+ return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
+{
+ return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
+{
+ return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
+{
+ return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
+{
+ return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
+{
+ return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
+{
+ return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
+{
+ return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
+{
+ return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
+{
+ return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
+{
+ return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+// Standard inner product (sum of component products); returns a scalar.
+// Integer variants can overflow silently for large component values.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float dot(float2 a, float2 b)
+{
+ return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ float dot(float3 a, float3 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ float dot(float4 a, float4 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ int dot(int2 a, int2 b)
+{
+ return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ int dot(int3 a, int3 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ int dot(int4 a, int4 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ uint dot(uint2 a, uint2 b)
+{
+ return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+// Euclidean (L2) norm: sqrtf(dot(v, v)). Single precision; may overflow to
+// inf for very large components since no scaling (hypot-style) is performed.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float length(float2 v)
+{
+ return sqrtf(dot(v, v));
+}
+inline __host__ __device__ float length(float3 v)
+{
+ return sqrtf(dot(v, v));
+}
+inline __host__ __device__ float length(float4 v)
+{
+ return sqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+// Scale v to unit length via v * rsqrtf(dot(v, v)).
+// For a zero vector, rsqrtf(0) is +inf, so the result is non-finite — callers
+// must not pass the zero vector.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 normalize(float2 v)
+{
+ float invLen = rsqrtf(dot(v, v));
+ return v * invLen;
+}
+inline __host__ __device__ float3 normalize(float3 v)
+{
+ float invLen = rsqrtf(dot(v, v));
+ return v * invLen;
+}
+inline __host__ __device__ float4 normalize(float4 v)
+{
+ float invLen = rsqrtf(dot(v, v));
+ return v * invLen;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+// Per-component floorf (round toward negative infinity).
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 floorf(float2 v)
+{
+ return make_float2(floorf(v.x), floorf(v.y));
+}
+inline __host__ __device__ float3 floorf(float3 v)
+{
+ return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+inline __host__ __device__ float4 floorf(float4 v)
+{
+ return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+// Defined as v - floorf(v), so the result is always in [0, 1) — for negative
+// inputs this differs from fmodf(v, 1.0f) (e.g. fracf(-0.25f) == 0.75f).
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float fracf(float v)
+{
+ return v - floorf(v);
+}
+inline __host__ __device__ float2 fracf(float2 v)
+{
+ return make_float2(fracf(v.x), fracf(v.y));
+}
+inline __host__ __device__ float3 fracf(float3 v)
+{
+ return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+inline __host__ __device__ float4 fracf(float4 v)
+{
+ return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+// Per-component floating-point remainder (sign follows the dividend,
+// as with scalar fmodf).
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmodf(float2 a, float2 b)
+{
+ return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+inline __host__ __device__ float3 fmodf(float3 a, float3 b)
+{
+ return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+inline __host__ __device__ float4 fmodf(float4 a, float4 b)
+{
+ return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+// Per-component absolute value. NOTE(review): the float overloads call fabs
+// (double) rather than fabsf; the result is converted back to float, which is
+// value-preserving but may be slower than fabsf on some targets — verify.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fabs(float2 v)
+{
+ return make_float2(fabs(v.x), fabs(v.y));
+}
+inline __host__ __device__ float3 fabs(float3 v)
+{
+ return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
+}
+inline __host__ __device__ float4 fabs(float4 v)
+{
+ return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
+}
+
+inline __host__ __device__ int2 abs(int2 v)
+{
+ return make_int2(abs(v.x), abs(v.y));
+}
+inline __host__ __device__ int3 abs(int3 v)
+{
+ return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+inline __host__ __device__ int4 abs(int4 v)
+{
+ return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+// - formula: i - 2*n*dot(n, i)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 reflect(float3 i, float3 n)
+{
+ return i - 2.0f * n * dot(n,i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+// Right-handed 3D cross product: result is perpendicular to both a and b.
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 cross(float3 a, float3 b)
+{
+ return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+// - uses the Hermite polynomial 3y^2 - 2y^3 on the clamped parameter y
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float smoothstep(float a, float b, float x)
+{
+ float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+ return (y*y*(3.0f - (2.0f*y)));
+}
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
+{
+ float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+ return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
+}
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
+{
+ float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+ return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
+}
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
+{
+ float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+ return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
+}
+
+#endif
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_string.h b/src/algorithms/tracking/libs/cudahelpers/helper_string.h
new file mode 100644
index 0000000..c734314
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_string.h
@@ -0,0 +1,516 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef STRING_HELPER_H
+#define STRING_HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+
+// Returns the index of the first character in 'string' that is not the given
+// delimiter (e.g. skips leading '-' of a command-line flag).
+// Returns 0 when the string is all delimiters or when only one character
+// would remain — the caller then sees the original string unchanged.
+inline int stringRemoveDelimiter(char delimiter, const char *string)
+{
+ int string_start = 0;
+
+ while (string[string_start] == delimiter)
+ {
+ string_start++;
+ }
+
+ // Guard: don't return an offset at/past the last character.
+ if (string_start >= (int)strlen(string)-1)
+ {
+ return 0;
+ }
+
+ return string_start;
+}
+
+// Scans 'filename' backwards for the last '.' and sets *extension to point at
+// the characters after it (inside the caller's buffer — no allocation).
+// Returns the index of the extension, or sets *extension to NULL when no '.'
+// is found. NOTE(review): when the '.' sits at index 1 (e.g. "a.txt") the
+// post-decrement leaves string_length == 0 and the extension is reported as
+// NULL — looks like an off-by-one edge case; verify before relying on it.
+inline int getFileExtension(char *filename, char **extension)
+{
+ int string_length = (int)strlen(filename);
+
+ // Note: first comparison reads the NUL terminator; index is decremented
+ // after each read, so the loop stops one position past the '.'.
+ while (filename[string_length--] != '.')
+ {
+ if (string_length == 0)
+ break;
+ }
+
+ // +2 compensates for the post-decrement and skips the '.' itself.
+ if (string_length > 0) string_length += 2;
+
+ if (string_length == 0)
+ *extension = NULL;
+ else
+ *extension = &filename[string_length];
+
+ return string_length;
+}
+
+
+// Returns true if the flag 'string_ref' appears among argv[1..argc-1].
+// Leading '-' characters are stripped and the comparison is case-insensitive;
+// the flag name must match exactly up to an optional '=' (so "--devices"
+// does not match "--device"). argv[0] (program name) is never inspected.
+inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
+{
+ bool bFound = false;
+
+ if (argc >= 1)
+ {
+ for (int i=1; i < argc; i++)
+ {
+ int string_start = stringRemoveDelimiter('-', argv[i]);
+ const char *string_argv = &argv[i][string_start];
+
+ // Length of the flag name alone, excluding any "=value" suffix.
+ const char *equal_pos = strchr(string_argv, '=');
+ int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+ int length = (int)strlen(string_ref);
+
+ if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
+ {
+ bFound = true;
+ continue;
+ }
+ }
+ }
+
+ return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+// Looks for "string_ref=value" (or "string_refvalue") in argv and stores the
+// parsed value into *value, cast to T. Returns true if the flag was found.
+// NOTE(review): parsing uses atoi, so for floating-point T the fractional
+// part is silently discarded — use getCmdLineArgumentFloat for floats.
+// *value is left untouched when the flag is present with no value text.
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
+{
+ bool bFound = false;
+
+ if (argc >= 1)
+ {
+ for (int i=1; i < argc; i++)
+ {
+ int string_start = stringRemoveDelimiter('-', argv[i]);
+ const char *string_argv = &argv[i][string_start];
+ int length = (int)strlen(string_ref);
+
+ // Prefix match only (case-insensitive): "--xval" matches "--x".
+ if (!STRNCASECMP(string_argv, string_ref, length))
+ {
+ if (length+1 <= (int)strlen(string_argv))
+ {
+ // Skip a single '=' separator if present.
+ int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+ *value = (T)atoi(&string_argv[length + auto_inc]);
+ }
+
+ bFound = true;
+ i=argc; // first match wins; stop scanning
+ }
+ }
+ }
+
+ return bFound;
+}
+
+// Returns the integer value of "string_ref=value" from argv, 0 when the flag
+// is present without a value, and 0 when the flag is absent (callers cannot
+// distinguish "missing" from an explicit 0). Matching is a case-insensitive
+// prefix match; unlike getCmdLineArgumentValue, the LAST occurrence wins
+// because the loop continues after a match.
+inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
+{
+ bool bFound = false;
+ int value = -1;
+
+ if (argc >= 1)
+ {
+ for (int i=1; i < argc; i++)
+ {
+ int string_start = stringRemoveDelimiter('-', argv[i]);
+ const char *string_argv = &argv[i][string_start];
+ int length = (int)strlen(string_ref);
+
+ if (!STRNCASECMP(string_argv, string_ref, length))
+ {
+ if (length+1 <= (int)strlen(string_argv))
+ {
+ // Skip a single '=' separator if present.
+ int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+ value = atoi(&string_argv[length + auto_inc]);
+ }
+ else
+ {
+ // Flag present with no value text.
+ value = 0;
+ }
+
+ bFound = true;
+ continue;
+ }
+ }
+ }
+
+ if (bFound)
+ {
+ return value;
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+// Float counterpart of getCmdLineArgumentInt: returns the value parsed with
+// atof from "string_ref=value", 0.f when the flag carries no value, and 0
+// when the flag is absent. Same caveats: case-insensitive prefix match,
+// last occurrence wins, and "missing" is indistinguishable from 0.
+inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
+{
+ bool bFound = false;
+ float value = -1;
+
+ if (argc >= 1)
+ {
+ for (int i=1; i < argc; i++)
+ {
+ int string_start = stringRemoveDelimiter('-', argv[i]);
+ const char *string_argv = &argv[i][string_start];
+ int length = (int)strlen(string_ref);
+
+ if (!STRNCASECMP(string_argv, string_ref, length))
+ {
+ if (length+1 <= (int)strlen(string_argv))
+ {
+ // Skip a single '=' separator if present.
+ int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+ value = (float)atof(&string_argv[length + auto_inc]);
+ }
+ else
+ {
+ // Flag present with no value text.
+ value = 0.f;
+ }
+
+ bFound = true;
+ continue;
+ }
+ }
+ }
+
+ if (bFound)
+ {
+ return value;
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+// Finds "string_ref=value" in argv and sets *string_retval to point at the
+// value text INSIDE argv (no copy is made — the pointer is only valid while
+// argv lives). Sets *string_retval to NULL and returns false when not found.
+// NOTE(review): &string_argv[length+1] assumes a one-character '=' separator;
+// if the flag is passed without '=', the pointer lands one past the flag name
+// (the empty string when the match is exact) — verify callers expect this.
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+ const char *string_ref, char **string_retval)
+{
+ bool bFound = false;
+
+ if (argc >= 1)
+ {
+ for (int i=1; i < argc; i++)
+ {
+ int string_start = stringRemoveDelimiter('-', argv[i]);
+ char *string_argv = (char *)&argv[i][string_start];
+ int length = (int)strlen(string_ref);
+
+ if (!STRNCASECMP(string_argv, string_ref, length))
+ {
+ *string_retval = &string_argv[length+1];
+ bFound = true;
+ continue;
+ }
+ }
+ }
+
+ if (!bFound)
+ {
+ *string_retval = NULL;
+ }
+
+ return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename name of the file
+//! @param executable_path optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename, const char *executable_path)
+{
+ // <executable_name> defines a variable that is replaced with the name of the executable
+
+ // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
+ // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
+ const char *searchPath[] =
+ {
+ "./", // same dir
+ "./common/", // "/common/" subdir
+ "./common/data/", // "/common/data/" subdir
+ "./data/", // "/data/" subdir
+ "./src/", // "/src/" subdir
+ "./src/<executable_name>/data/", // "/src/<executable_name>/data/" subdir
+ "./inc/", // "/inc/" subdir
+ "./0_Simple/", // "/0_Simple/" subdir
+ "./1_Utilities/", // "/1_Utilities/" subdir
+ "./2_Graphics/", // "/2_Graphics/" subdir
+ "./3_Imaging/", // "/3_Imaging/" subdir
+ "./4_Finance/", // "/4_Finance/" subdir
+ "./5_Simulations/", // "/5_Simulations/" subdir
+ "./6_Advanced/", // "/6_Advanced/" subdir
+ "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir
+ "./8_Android/", // "/8_Android/" subdir
+ "./samples/", // "/samples/" subdir
+
+ "../", // up 1 in tree
+ "../common/", // up 1 in tree, "/common/" subdir
+ "../common/data/", // up 1 in tree, "/common/data/" subdir
+ "../data/", // up 1 in tree, "/data/" subdir
+ "../src/", // up 1 in tree, "/src/" subdir
+ "../inc/", // up 1 in tree, "/inc/" subdir
+
+ "../0_Simple/<executable_name>/data/", // up 1 in tree, "/0_Simple/<executable_name>/" subdir
+ "../1_Utilities/<executable_name>/data/", // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../2_Graphics/<executable_name>/data/", // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../3_Imaging/<executable_name>/data/", // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../4_Finance/<executable_name>/data/", // up 1 in tree, "/4_Finance/<executable_name>/" subdir
+ "../5_Simulations/<executable_name>/data/", // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../6_Advanced/<executable_name>/data/", // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../8_Android/<executable_name>/data/", // up 1 in tree, "/8_Android/<executable_name>/" subdir
+ "../samples/<executable_name>/data/", // up 1 in tree, "/samples/<executable_name>/" subdir
+ "../../", // up 2 in tree
+ "../../common/", // up 2 in tree, "/common/" subdir
+ "../../common/data/", // up 2 in tree, "/common/data/" subdir
+ "../../data/", // up 2 in tree, "/data/" subdir
+ "../../src/", // up 2 in tree, "/src/" subdir
+ "../../inc/", // up 2 in tree, "/inc/" subdir
+ "../../sandbox/<executable_name>/data/", // up 2 in tree, "/sandbox/<executable_name>/" subdir
+ "../../0_Simple/<executable_name>/data/", // up 2 in tree, "/0_Simple/<executable_name>/" subdir
+ "../../1_Utilities/<executable_name>/data/", // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../../2_Graphics/<executable_name>/data/", // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../../3_Imaging/<executable_name>/data/", // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../../4_Finance/<executable_name>/data/", // up 2 in tree, "/4_Finance/<executable_name>/" subdir
+ "../../5_Simulations/<executable_name>/data/", // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../../6_Advanced/<executable_name>/data/", // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../../8_Android/<executable_name>/data/", // up 2 in tree, "/8_Android/<executable_name>/" subdir
+ "../../samples/<executable_name>/data/", // up 2 in tree, "/samples/<executable_name>/" subdir
+ "../../../", // up 3 in tree
+ "../../../src/<executable_name>/", // up 3 in tree, "/src/<executable_name>/" subdir
+ "../../../src/<executable_name>/data/", // up 3 in tree, "/src/<executable_name>/data/" subdir
+ "../../../src/<executable_name>/src/", // up 3 in tree, "/src/<executable_name>/src/" subdir
+ "../../../src/<executable_name>/inc/", // up 3 in tree, "/src/<executable_name>/inc/" subdir
+ "../../../sandbox/<executable_name>/", // up 3 in tree, "/sandbox/<executable_name>/" subdir
+ "../../../sandbox/<executable_name>/data/", // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
+ "../../../sandbox/<executable_name>/src/", // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
+ "../../../sandbox/<executable_name>/inc/", // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
+ "../../../0_Simple/<executable_name>/data/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+ "../../../1_Utilities/<executable_name>/data/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../../../2_Graphics/<executable_name>/data/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../../../3_Imaging/<executable_name>/data/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../../../4_Finance/<executable_name>/data/", // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+ "../../../5_Simulations/<executable_name>/data/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../../../6_Advanced/<executable_name>/data/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../../../8_Android/<executable_name>/data/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
+ "../../../0_Simple/<executable_name>/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+ "../../../1_Utilities/<executable_name>/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../../../2_Graphics/<executable_name>/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../../../3_Imaging/<executable_name>/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../../../4_Finance/<executable_name>/", // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+ "../../../5_Simulations/<executable_name>/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../../../6_Advanced/<executable_name>/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../../../8_Android/<executable_name>/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
+ "../../../samples/<executable_name>/data/", // up 3 in tree, "/samples/<executable_name>/" subdir
+ "../../../common/", // up 3 in tree, "../../../common/" subdir
+ "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
+ "../../../data/", // up 3 in tree, "../../../data/" subdir
+ "../../../../", // up 4 in tree
+ "../../../../src/<executable_name>/", // up 4 in tree, "/src/<executable_name>/" subdir
+ "../../../../src/<executable_name>/data/", // up 4 in tree, "/src/<executable_name>/data/" subdir
+ "../../../../src/<executable_name>/src/", // up 4 in tree, "/src/<executable_name>/src/" subdir
+ "../../../../src/<executable_name>/inc/", // up 4 in tree, "/src/<executable_name>/inc/" subdir
+ "../../../../sandbox/<executable_name>/", // up 4 in tree, "/sandbox/<executable_name>/" subdir
+ "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
+ "../../../../sandbox/<executable_name>/src/", // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
+ "../../../../sandbox/<executable_name>/inc/", // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
+ "../../../../0_Simple/<executable_name>/data/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+ "../../../../1_Utilities/<executable_name>/data/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../../../../2_Graphics/<executable_name>/data/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../../../../3_Imaging/<executable_name>/data/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../../../../4_Finance/<executable_name>/data/", // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+ "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../../../../6_Advanced/<executable_name>/data/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../../../../8_Android/<executable_name>/data/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
+ "../../../../0_Simple/<executable_name>/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+ "../../../../1_Utilities/<executable_name>/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../../../../2_Graphics/<executable_name>/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../../../../3_Imaging/<executable_name>/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../../../../4_Finance/<executable_name>/", // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+ "../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../../../../6_Advanced/<executable_name>/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../../../../8_Android/<executable_name>/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
+ "../../../../samples/<executable_name>/data/", // up 4 in tree, "/samples/<executable_name>/" subdir
+ "../../../../common/", // up 4 in tree, "../../../common/" subdir
+ "../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir
+ "../../../../data/", // up 4 in tree, "../../../data/" subdir
+ "../../../../../", // up 5 in tree
+ "../../../../../src/<executable_name>/", // up 5 in tree, "/src/<executable_name>/" subdir
+ "../../../../../src/<executable_name>/data/", // up 5 in tree, "/src/<executable_name>/data/" subdir
+ "../../../../../src/<executable_name>/src/", // up 5 in tree, "/src/<executable_name>/src/" subdir
+ "../../../../../src/<executable_name>/inc/", // up 5 in tree, "/src/<executable_name>/inc/" subdir
+ "../../../../../sandbox/<executable_name>/", // up 5 in tree, "/sandbox/<executable_name>/" subdir
+ "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
+ "../../../../../sandbox/<executable_name>/src/", // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
+ "../../../../../sandbox/<executable_name>/inc/", // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
+ "../../../../../0_Simple/<executable_name>/data/", // up 5 in tree, "/0_Simple/<executable_name>/" subdir
+ "../../../../../1_Utilities/<executable_name>/data/", // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
+ "../../../../../2_Graphics/<executable_name>/data/", // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
+ "../../../../../3_Imaging/<executable_name>/data/", // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
+ "../../../../../4_Finance/<executable_name>/data/", // up 5 in tree, "/4_Finance/<executable_name>/" subdir
+ "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
+ "../../../../../6_Advanced/<executable_name>/data/", // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
+ "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+ "../../../../../8_Android/<executable_name>/data/", // up 5 in tree, "/8_Android/<executable_name>/" subdir
+ "../../../../../samples/<executable_name>/data/", // up 5 in tree, "/samples/<executable_name>/" subdir
+ "../../../../../common/", // up 5 in tree, "../../../common/" subdir
+ "../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir
+ };
+
+ // Extract the executable name
+ std::string executable_name;
+
+ if (executable_path != 0)
+ {
+ executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ // Windows path delimiter
+ size_t delimiter_pos = executable_name.find_last_of('\\');
+ executable_name.erase(0, delimiter_pos + 1);
+
+ if (executable_name.rfind(".exe") != std::string::npos)
+ {
+ // we strip .exe, only if the .exe is found
+ executable_name.resize(executable_name.size() - 4);
+ }
+
+#else
+ // Linux & OSX path delimiter
+ size_t delimiter_pos = executable_name.find_last_of('/');
+ executable_name.erase(0,delimiter_pos+1);
+#endif
+ }
+
+ // Loop over all search paths and return the first hit
+ for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
+ {
+ std::string path(searchPath[i]);
+ size_t executable_name_pos = path.find("<executable_name>");
+
+ // If there is executable_name variable in the searchPath
+ // replace it with the value
+ if (executable_name_pos != std::string::npos)
+ {
+ if (executable_path != 0)
+ {
+ path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
+ }
+ else
+ {
+ // Skip this path entry if no executable argument is given
+ continue;
+ }
+ }
+
+#ifdef _DEBUG
+ printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+ // Test if the file exists
+ path.append(filename);
+ FILE *fp;
+ FOPEN(fp, path.c_str(), "rb");
+
+ if (fp != NULL)
+ {
+ fclose(fp);
+ // File found
+ // returning an allocated array here for backwards compatibility reasons
+ char *file_path = (char *) malloc(path.length() + 1);
+ STRCPY(file_path, path.length() + 1, path.c_str());
+ return file_path;
+ }
+
+ if (fp)
+ {
+ fclose(fp);
+ }
+ }
+
+ // File not found
+ return 0;
+}
+
+#endif
diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_timer.h b/src/algorithms/tracking/libs/cudahelpers/helper_timer.h
new file mode 100644
index 0000000..39ddc77
--- /dev/null
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_timer.h
@@ -0,0 +1,499 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// Helper Timing Functions
+#ifndef HELPER_TIMER_H
+#define HELPER_TIMER_H
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Definition of the StopWatch interface; used instead of the legacy CUT
+// functions when a self-contained, portable timer class is preferred.
+// Concrete platform implementations: StopWatchWin / StopWatchLinux below.
+class StopWatchInterface
+{
+    public:
+        StopWatchInterface() {};
+        virtual ~StopWatchInterface() {};
+
+    public:
+        //! Start time measurement
+        virtual void start() = 0;
+
+        //! Stop time measurement
+        virtual void stop() = 0;
+
+        //! Reset time counters to zero
+        virtual void reset() = 0;
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        virtual float getTime() = 0;
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        virtual float getAverageTime() = 0;
+};
+
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch,
+//! based on QueryPerformanceCounter/QueryPerformanceFrequency.
+class StopWatchWin : public StopWatchInterface
+{
+    public:
+        //! Constructor, default
+        StopWatchWin() :
+            start_time(), end_time(),
+            diff_time(0.0f), total_time(0.0f),
+            running(false), clock_sessions(0), freq(0), freq_set(false)
+        {
+            if (! freq_set)
+            {
+                // helper variable
+                LARGE_INTEGER temp;
+
+                // get the tick frequency from the OS
+                QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
+
+                // convert to ticks per millisecond, so elapsed tick counts
+                // divided by freq yield msec directly
+                freq = ((double) temp.QuadPart) / 1000.0;
+
+                // remember that the query has been done
+                freq_set = true;
+            }
+        };
+
+        // Destructor
+        ~StopWatchWin() { };
+
+    public:
+        //! Start time measurement
+        inline void start();
+
+        //! Stop time measurement
+        inline void stop();
+
+        //! Reset time counters to zero
+        inline void reset();
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        inline float getTime();
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        inline float getAverageTime();
+
+    private:
+        // member variables
+
+        //! Start of measurement
+        LARGE_INTEGER start_time;
+        //! End of measurement
+        LARGE_INTEGER end_time;
+
+        //! Time difference between the last start and stop
+        float diff_time;
+
+        //! TOTAL time difference between starts and stops
+        float total_time;
+
+        //! flag if the stop watch is running
+        bool running;
+
+        //! Number of times clock has been started
+        //! and stopped to allow averaging
+        int clock_sessions;
+
+        //! tick frequency (ticks per millisecond)
+        double freq;
+
+        //! flag if the frequency has been set
+        bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::start()
+{
+    // capture the current tick count as the session start point
+    QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::stop()
+{
+    QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
+    // elapsed ticks divided by ticks-per-msec gives the session time in msec
+    diff_time = (float)
+                (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
+
+    total_time += diff_time;
+    clock_sessions++;
+    running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::reset()
+{
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running)
+    {
+        // restart the current session from "now"
+        QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchWin::getTime()
+{
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running)
+    {
+        // add the partial, still-running session measured from start_time
+        LARGE_INTEGER temp;
+        QueryPerformanceCounter((LARGE_INTEGER *) &temp);
+        retval += (float)
+                  (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time. Returns 0.0f when no session has finished yet.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchWin::getAverageTime()
+{
+    // guard against division by zero before any stop() has been called
+    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <ctime>
+#include <sys/time.h>
+
+//! Linux (and Mac OSX) specific implementation of StopWatch,
+//! based on gettimeofday().
+class StopWatchLinux : public StopWatchInterface
+{
+    public:
+        //! Constructor, default
+        StopWatchLinux() :
+            start_time(), diff_time(0.0), total_time(0.0),
+            running(false), clock_sessions(0)
+        { };
+
+        // Destructor
+        virtual ~StopWatchLinux()
+        { };
+
+    public:
+        //! Start time measurement
+        inline void start();
+
+        //! Stop time measurement
+        inline void stop();
+
+        //! Reset time counters to zero
+        inline void reset();
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        inline float getTime();
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        inline float getAverageTime();
+
+    private:
+
+        // helper functions
+
+        //! Get difference between start time and current time
+        inline float getDiffTime();
+
+    private:
+
+        // member variables
+
+        //! Start of measurement
+        struct timeval start_time;
+
+        //! Time difference between the last start and stop
+        float diff_time;
+
+        //! TOTAL time difference between starts and stops
+        float total_time;
+
+        //! flag if the stop watch is running
+        bool running;
+
+        //! Number of times clock has been started
+        //! and stopped to allow averaging
+        int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::start()
+{
+    // capture the current wall-clock time as the session start point
+    gettimeofday(&start_time, 0);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::stop()
+{
+    // accumulate this session's elapsed msec into the running total
+    diff_time = getDiffTime();
+    total_time += diff_time;
+    running = false;
+    clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::reset()
+{
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running)
+    {
+        // restart the current session from "now"
+        gettimeofday(&start_time, 0);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getTime()
+{
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running)
+    {
+        // add the partial, still-running session measured from start_time
+        retval += getDiffTime();
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time. Returns 0.0f when no session has finished yet.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getAverageTime()
+{
+    // guard against division by zero before any stop() has been called
+    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+//! Elapsed time in msec. between start_time and the current wall-clock time.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getDiffTime()
+{
+    struct timeval t_time;
+    gettimeofday(&t_time, 0);
+
+    // time difference in milli-seconds: seconds scaled up, microseconds scaled down
+    return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
+                   + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer, selecting the platform implementation at compile time
+//! @return true if a timer has been created, otherwise false
+//! @param timer_interface receives the newly allocated timer object
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkCreateTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    *timer_interface = (StopWatchInterface *)new StopWatchWin();
+#else
+    *timer_interface = (StopWatchInterface *)new StopWatchLinux();
+#endif
+    // NOTE(review): plain new throws std::bad_alloc rather than returning NULL,
+    // so this check is effectively always true
+    return (*timer_interface != NULL) ? true : false;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer and null out the caller's pointer
+//! @return always true (a NULL pointer is silently ignored)
+//! @param timer_interface timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkDeleteTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        delete *timer_interface;
+        *timer_interface = NULL;
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the given timer
+//! @return always true (a NULL pointer is silently ignored)
+//! @param timer_interface timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkStartTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->start();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the given timer. Does not reset its accumulated time.
+//! @return always true (a NULL pointer is silently ignored)
+//! @param timer_interface timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkStopTimer(StopWatchInterface **timer_interface)
+{
+    // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->stop();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counters to zero
+//! @return always true (a NULL pointer is silently ignored)
+//! @param timer_interface timer to reset
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkResetTimer(StopWatchInterface **timer_interface)
+{
+    // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->reset();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer divided by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @return average time in msec., or 0.0f for a NULL timer
+//! @param timer_interface timer to query
+////////////////////////////////////////////////////////////////////////////////
+inline float
+sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
+{
+    // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        return (*timer_interface)->getAverageTime();
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @return total time in msec., or 0.0f for a NULL timer
+//! @param timer_interface timer to query
+////////////////////////////////////////////////////////////////////////////////
+inline float
+sdkGetTimerValue(StopWatchInterface **timer_interface)
+{
+    // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        return (*timer_interface)->getTime();
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+#endif // HELPER_TIMER_H
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-hamradio/gnss-sdr.git
More information about the pkg-hamradio-commits
mailing list