[beignet] 01/10: Imported Upstream version 1.3.0
Rebecca Palmer
rnpalmer-guest at moszumanska.debian.org
Sun Jan 22 22:40:00 UTC 2017
This is an automated email from the git hooks/post-receive script.
rnpalmer-guest pushed a commit to branch master
in repository beignet.
commit 037c0b29c5b06f89e64ffc50542cbc89205317aa
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date: Sat Jan 21 13:30:00 2017 +0000
Imported Upstream version 1.3.0
---
Android.common.mk | 2 +-
CMake/FindMesaSrc.cmake | 26 -
CMakeLists.txt | 102 +-
GetGenID.sh | 50 +-
backend/CMakeLists.txt | 5 +-
backend/src/Android.mk | 11 +
backend/src/CMakeLists.txt | 15 +
backend/src/GBEConfig.h.in | 2 +
backend/src/backend/context.cpp | 9 +-
backend/src/backend/gen/gen_mesa_disasm.c | 77 +-
backend/src/backend/gen75_encoder.cpp | 4 +-
backend/src/backend/gen75_encoder.hpp | 4 +-
backend/src/backend/gen7_encoder.cpp | 2 +-
backend/src/backend/gen7_encoder.hpp | 2 +-
backend/src/backend/gen8_context.cpp | 221 +-
backend/src/backend/gen8_context.hpp | 8 +
backend/src/backend/gen8_encoder.cpp | 264 +-
backend/src/backend/gen8_encoder.hpp | 16 +-
backend/src/backend/gen8_instruction.hpp | 59 +-
backend/src/backend/gen9_context.cpp | 71 +-
backend/src/backend/gen9_context.hpp | 1 +
backend/src/backend/gen9_encoder.cpp | 236 +
backend/src/backend/gen9_encoder.hpp | 10 +-
backend/src/backend/gen9_instruction.hpp | 84 +
backend/src/backend/gen_context.cpp | 572 +-
backend/src/backend/gen_context.hpp | 9 +-
backend/src/backend/gen_defs.hpp | 13 +
backend/src/backend/gen_encoder.cpp | 215 +-
backend/src/backend/gen_encoder.hpp | 49 +-
.../src/backend/gen_insn_gen7_schedule_info.hxx | 7 +
backend/src/backend/gen_insn_selection.cpp | 2125 +++-
backend/src/backend/gen_insn_selection.hpp | 16 +-
backend/src/backend/gen_insn_selection.hxx | 8 +
.../src/backend/gen_insn_selection_optimize.cpp | 22 +-
backend/src/backend/gen_insn_selection_output.cpp | 138 +-
backend/src/backend/gen_insn_selection_output.hpp | 2 +-
backend/src/backend/gen_program.cpp | 21 +-
backend/src/backend/gen_reg_allocation.cpp | 72 +-
backend/src/backend/gen_register.hpp | 18 +
backend/src/backend/program.cpp | 165 +-
backend/src/backend/program.h | 22 +
backend/src/backend/program.hpp | 24 +
backend/src/gbe_bin_interpreter.cpp | 6 +
backend/src/ir/constant.cpp | 6 +-
backend/src/ir/constant.hpp | 7 +-
backend/src/ir/function.cpp | 7 +-
backend/src/ir/function.hpp | 17 +-
backend/src/ir/instruction.cpp | 81 +-
backend/src/ir/instruction.hpp | 15 +-
backend/src/ir/instruction.hxx | 1 +
backend/src/ir/lowering.cpp | 11 +-
backend/src/ir/profile.cpp | 17 +-
backend/src/ir/profile.hpp | 54 +-
backend/src/ir/profiling.cpp | 2 +-
backend/src/ir/register.cpp | 3 +
backend/src/ir/register.hpp | 8 +-
backend/src/ir/reloc.cpp | 87 +
backend/src/ir/reloc.hpp | 90 +
backend/src/ir/type.hpp | 4 +-
backend/src/ir/unit.cpp | 6 +-
backend/src/ir/unit.hpp | 15 +-
backend/src/libocl/Android.mk | 1 -
backend/src/libocl/CMakeLists.txt | 145 +-
backend/src/libocl/include/ocl.h | 13 +-
backend/src/libocl/include/ocl_atom_20.h | 188 +
backend/src/libocl/include/ocl_enqueue.h | 90 +
backend/src/libocl/include/ocl_image.h | 222 +-
backend/src/libocl/include/ocl_misc.h | 9 +
backend/src/libocl/include/ocl_pipe.h | 51 +
backend/src/libocl/include/ocl_sync.h | 7 +-
backend/src/libocl/include/ocl_types.h | 43 +-
backend/src/libocl/include/ocl_vload_20.h | 150 +
backend/src/libocl/include/ocl_workitem.h | 20 +-
backend/src/libocl/script/gen_vector.py | 5 +-
backend/src/libocl/script/ocl_integer.def | 1 +
backend/src/libocl/script/ocl_math_20.def | 151 +
backend/src/libocl/src/ocl_atom_20.cl | 381 +
backend/src/libocl/src/ocl_atomic_20.ll | 165 +
backend/src/libocl/src/ocl_barrier.ll | 27 +-
backend/src/libocl/src/ocl_barrier_20.ll | 25 +
backend/src/libocl/src/ocl_clz_20.ll | 65 +
backend/src/libocl/src/ocl_ctz.ll | 65 +
backend/src/libocl/src/ocl_ctz_20.ll | 65 +
backend/src/libocl/src/ocl_enqueue.cl | 238 +
backend/src/libocl/src/ocl_geometric.cl | 4 +
backend/src/libocl/src/ocl_image.cl | 218 +-
backend/src/libocl/src/ocl_memcpy.cl | 15 +
backend/src/libocl/src/ocl_memset.cl | 3 +
backend/src/libocl/src/ocl_misc.cl | 24 +
backend/src/libocl/src/ocl_pipe.cl | 296 +
backend/src/libocl/src/ocl_sync.cl | 6 +-
backend/src/libocl/src/ocl_vload_20.cl | 284 +
backend/src/libocl/src/ocl_workitem.cl | 25 +-
backend/src/libocl/tmpl/ocl_defines.tmpl.h | 7 +-
backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 12 +
backend/src/libocl/tmpl/ocl_integer.tmpl.h | 18 +
backend/src/libocl/tmpl/ocl_math_20.tmpl.cl | 3801 ++++++
backend/src/libocl/tmpl/ocl_math_20.tmpl.h | 209 +
backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 292 +-
backend/src/libocl/tmpl/ocl_simd.tmpl.h | 131 +-
backend/src/llvm/ExpandLargeIntegers.cpp | 5 +-
backend/src/llvm/PromoteIntegers.cpp | 1 +
backend/src/llvm/StripAttributes.cpp | 6 +-
backend/src/llvm/llvm_bitcode_link.cpp | 49 +-
backend/src/llvm/llvm_device_enqueue.cpp | 417 +
backend/src/llvm/llvm_gen_backend.cpp | 668 +-
backend/src/llvm/llvm_gen_backend.hpp | 7 +-
backend/src/llvm/llvm_gen_ocl_function.hxx | 66 +-
backend/src/llvm/llvm_intrinsic_lowering.cpp | 2 +
backend/src/llvm/llvm_passes.cpp | 25 +-
backend/src/llvm/llvm_scalarize.cpp | 42 +-
backend/src/llvm/llvm_to_gen.cpp | 22 +-
backend/src/llvm/llvm_unroll.cpp | 14 +-
backend/src/ocl_common_defines.h | 11 +-
docs/Beignet.mdwn | 13 +
docs/NEWS.mdwn | 3 +
docs/howto/android-build-howto.mdwn | 64 +
include/CL/cl.h | 389 +-
include/CL/cl.hpp | 12452 -------------------
include/CL/cl_d3d10.h | 7 +-
include/CL/cl_d3d11.h | 7 +-
include/CL/cl_dx9_media_sharing.h | 9 +-
include/CL/cl_egl.h | 9 +-
include/CL/cl_ext.h | 49 +-
include/CL/cl_gl.h | 7 +-
include/CL/cl_gl_ext.h | 7 +-
include/CL/cl_platform.h | 38 +-
include/CL/opencl.h | 7 +-
kernels/compiler_atomic_functions_20.cl | 53 +
kernels/compiler_ceil64.spir | Bin 0 -> 2152 bytes
kernels/compiler_ctz.cl | 16 +
kernels/compiler_device_enqueue.cl | 18 +
kernels/compiler_generic_atomic.cl | 33 +
kernels/compiler_generic_pointer.cl | 33 +
kernels/compiler_pipe_builtin.cl | 117 +
kernels/compiler_program_global.cl | 77 +
kernels/compiler_sub_group_shuffle.cl | 22 +-
kernels/compiler_sub_group_shuffle_down.cl | 23 +-
kernels/compiler_sub_group_shuffle_up.cl | 23 +-
kernels/compiler_sub_group_shuffle_xor.cl | 23 +-
kernels/compiler_subgroup_broadcast.cl | 26 +-
kernels/compiler_subgroup_buffer_block_read.cl | 47 +-
kernels/compiler_subgroup_buffer_block_write.cl | 44 +-
kernels/compiler_subgroup_image_block_read.cl | 49 +-
kernels/compiler_subgroup_image_block_write.cl | 46 +-
kernels/compiler_subgroup_reduce.cl | 41 +
kernels/compiler_subgroup_scan_exclusive.cl | 55 +
kernels/compiler_subgroup_scan_inclusive.cl | 55 +
src/Android.mk | 18 +-
src/CMakeLists.txt | 42 +-
src/cl_accelerator_intel.c | 17 +-
src/cl_accelerator_intel.h | 11 +-
src/cl_api.c | 3369 +----
src/cl_api_command_queue.c | 233 +
src/cl_api_context.c | 174 +
src/cl_api_device_id.c | 90 +
src/cl_api_event.c | 330 +
src/cl_api_kernel.c | 422 +
src/cl_api_mem.c | 2435 ++++
src/cl_api_platform_id.c | 65 +
src/cl_api_program.c | 171 +
src/cl_api_sampler.c | 127 +
src/cl_base_object.c | 140 +
src/cl_base_object.h | 84 +
src/cl_cmrt.cpp | 2 +-
src/cl_command_queue.c | 326 +-
src/cl_command_queue.h | 95 +-
src/cl_command_queue_enqueue.c | 330 +
src/cl_command_queue_gen7.c | 101 +-
src/cl_context.c | 270 +-
src/cl_context.h | 61 +-
src/cl_device_enqueue.c | 201 +
.../src/ocl_sync.cl => src/cl_device_enqueue.h | 25 +-
src/cl_device_id.c | 588 +-
src/cl_device_id.h | 34 +-
src/cl_driver.h | 57 +-
src/cl_driver_defs.c | 12 +-
src/cl_enqueue.c | 582 +-
src/cl_enqueue.h | 56 +-
src/cl_event.c | 1171 +-
src/cl_event.h | 148 +-
src/cl_extensions.c | 5 +-
src/cl_extensions.h | 11 +-
src/cl_gbe_loader.cpp | 25 +
src/cl_gbe_loader.h | 5 +
src/cl_gl_api.c | 19 +-
src/cl_gt_device.h | 21 +-
src/cl_image.c | 24 +-
src/cl_image.h | 1 +
src/cl_kernel.c | 112 +-
src/cl_kernel.h | 26 +-
src/cl_khr_icd.c | 30 +-
src/cl_khr_icd.h | 2 -
src/cl_mem.c | 648 +-
src/cl_mem.h | 114 +-
src/cl_mem_gl.c | 7 +-
src/cl_platform_id.c | 53 +-
src/cl_platform_id.h | 16 +-
src/cl_program.c | 143 +-
src/cl_program.h | 14 +-
src/cl_sampler.c | 54 +-
src/cl_sampler.h | 33 +-
src/cl_thread.c | 329 -
src/cl_thread.h | 52 -
src/cl_utils.c | 86 +
src/cl_utils.h | 90 +-
src/intel/intel_cl_gl_share_image_info.h | 18 +
src/intel/intel_dri_resource_sharing.c | 208 -
src/intel/intel_dri_resource_sharing.h | 39 -
src/intel/intel_dri_resource_sharing_int.h | 143 -
src/intel/intel_driver.c | 1178 +-
src/intel/intel_driver.h | 1 +
src/intel/intel_gpgpu.c | 99 +-
src/intel/intel_gpgpu.h | 1 +
src/performance.c | 6 +
src/x11/mesa_egl_extension.c | 306 -
src/x11/mesa_egl_extension.h | 20 -
src/x11/mesa_egl_res_share.c | 135 -
src/x11/mesa_egl_res_share.h | 44 -
utests/CMakeLists.txt | 39 +-
utests/compiler_atomic_functions_20.cpp | 106 +
utests/compiler_ctz.cpp | 62 +
utests/compiler_device_enqueue.cpp | 36 +
utests/compiler_fill_gl_image.cpp | 69 +-
utests/compiler_generic_atomic.cpp | 45 +
utests/compiler_generic_pointer.cpp | 46 +
utests/compiler_pipe_builtin.cpp | 69 +
utests/compiler_program_global.cpp | 80 +
utests/compiler_sampler.cpp | 14 +-
utests/compiler_sub_group_shuffle.cpp | 52 +-
utests/compiler_sub_group_shuffle_down.cpp | 54 +-
utests/compiler_sub_group_shuffle_up.cpp | 54 +-
utests/compiler_sub_group_shuffle_xor.cpp | 54 +-
utests/compiler_subgroup_broadcast.cpp | 38 +-
utests/compiler_subgroup_buffer_block_read.cpp | 73 +-
utests/compiler_subgroup_buffer_block_write.cpp | 74 +-
utests/compiler_subgroup_image_block_read.cpp | 98 +-
utests/compiler_subgroup_image_block_write.cpp | 73 +-
utests/compiler_subgroup_reduce.cpp | 170 +-
utests/compiler_subgroup_scan_exclusive.cpp | 173 +-
utests/compiler_subgroup_scan_inclusive.cpp | 166 +-
utests/load_program_from_spir.cpp | 5 +-
utests/multi_queue_events.cpp | 129 +
utests/runtime_barrier_list.cpp | 11 +-
utests/runtime_event.cpp | 4 +-
utests/runtime_marker_list.cpp | 13 +-
utests/runtime_pipe_query.cpp | 15 +
utests/setenv.sh.in | 2 +
utests/utest_helper.cpp | 203 +-
utests/utest_helper.hpp | 11 +-
250 files changed, 24693 insertions(+), 21135 deletions(-)
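A review note on this import: the headline feature is optional OpenCL 2.0 support (pipes, device-side enqueue, the generic address space, program-scope globals, and the 2.0 atomic/vload/math builtins), alongside a large rework of the event and command-queue machinery (the new src/cl_api_*.c and src/cl_base_object.c files, with src/cl_thread.c removed) and replacement of the mesa-source-based cl_khr_gl_sharing path with an EGL-based one. A quick host-side way to check whether an installed build was configured for 2.0 is to query the reported OpenCL C version; a minimal sketch using only the standard OpenCL host API, nothing beignet-specific (file name illustrative):

    /* check_clc.cpp -- build with: g++ check_clc.cpp -lOpenCL */
    #include <CL/cl.h>
    #include <cstdio>

    int main() {
      cl_platform_id platform;
      cl_device_id device;
      char ver[256] = {0};
      if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
          clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL) != CL_SUCCESS)
        return 1;
      clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(ver), ver, NULL);
      std::printf("%s\n", ver); /* reports "OpenCL C 2.0" on a build configured with ENABLE_OPENCL_20 */
      return 0;
    }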
diff --git a/Android.common.mk b/Android.common.mk
index dcb3c7c..60cd23b 100644
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -2,7 +2,7 @@
#include $(CLEAR_VARS)
TOP_C_INCLUDE := bionic $(BEIGNET_ROOT_PATH)/include
-TOP_CPPFLAGS := -Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
+TOP_CPPFLAGS := -Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++11 -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
TOP_CFLAGS := -Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
LLVM_INCLUDE_DIRS := external/llvm/device/include\
diff --git a/CMake/FindMesaSrc.cmake b/CMake/FindMesaSrc.cmake
deleted file mode 100644
index 978cb4e..0000000
--- a/CMake/FindMesaSrc.cmake
+++ /dev/null
@@ -1,26 +0,0 @@
-#
-# Try to find mesa source code
-# Once done this will define
-#
-# MESA_SOURCE_FOUND
-# MESA_SOURCE_INCLUDES
-#
-
-# Find mesa source code.
-FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
- $ENV{MESA_SOURCE_DIR}
- ${MAKE_CURRENT_SOURCE_DIR}/../mesa
- ~/mesa
- DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
-
-IF(MESA_SOURCE_PREFIX)
-SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
- ${MESA_SOURCE_PREFIX}/include
- ${MESA_SOURCE_PREFIX}/src/mapi
- ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
- ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i915/
- ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
-SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
-ELSE(MESA_SOURCE_PREFIX)
-SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
-ENDIF(MESA_SOURCE_PREFIX)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bac054c..02b5d88 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,11 +16,6 @@ endif ()
CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
PROJECT(OCL)
-set (LIBCL_DRIVER_VERSION_MAJOR 1)
-set (LIBCL_DRIVER_VERSION_MINOR 2)
-set (LIBCL_DRIVER_VERSION_PATCH 1)
-set (LIBCL_C_VERSION_MAJOR 1)
-set (LIBCL_C_VERSION_MINOR 2)
if( ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
set(COMPILER "CLANG")
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
@@ -28,11 +23,6 @@ elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
set(COMPILER "ICC")
endif()
-configure_file (
- "src/OCLConfig.h.in"
- "src/OCLConfig.h"
-)
-
set (NOT_BUILD_STAND_ALONE_UTEST 1)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
@@ -154,41 +144,38 @@ IF(DRM_INTEL_FOUND)
ELSE(HAVE_DRM_INTEL_SUBSLICE_TOTAL)
MESSAGE(STATUS "Disable subslice total query support")
ENDIF(HAVE_DRM_INTEL_SUBSLICE_TOTAL)
- CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_pooled_eu" "" HAVE_DRM_INTEL_POOLED_EU)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_pooled_eu" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_POOLED_EU)
IF(HAVE_DRM_INTEL_POOLED_EU)
MESSAGE(STATUS "Enable pooled eu query support")
ELSE(HAVE_DRM_INTEL_POOLED_EU)
MESSAGE(STATUS "Disable pooled eu query support")
ENDIF(HAVE_DRM_INTEL_POOLED_EU)
- CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_min_eu_in_pool" "" HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_min_eu_in_pool" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_MIN_EU_IN_POOL)
IF(HAVE_DRM_INTEL_MIN_EU_IN_POOL)
MESSAGE(STATUS "Enable min eu in pool query support")
ELSE(HAVE_DRM_INTEL_MIN_EU_IN_POOL)
MESSAGE(STATUS "Disable min eu in pool query support")
ENDIF(HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_bo_set_softpin_offset" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_BO_SET_SOFTPIN)
ELSE(DRM_INTEL_FOUND)
MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
ENDIF(DRM_INTEL_FOUND)
# CMRT
+#disable CMRT as default, since we do not see real case,
+#but see build issue of this feature
+OPTION(INVOKE_CMRT "Enable CMRT" OFF)
+IF(INVOKE_CMRT)
pkg_check_modules(CMRT libcmrt)
IF(CMRT_FOUND)
INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
ENDIF(CMRT_FOUND)
+ENDIF(INVOKE_CMRT)
# Threads
Find_Package(Threads)
IF(X11_FOUND)
-# OpenGL (not use cmake helper)
-pkg_check_modules(OPENGL gl)
-IF(OPENGL_FOUND)
- INCLUDE_DIRECTORIES(${OPENGL_INCLUDE_DIRS})
- MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
-ELSE(OPENGL_FOUND)
- MESSAGE(STATUS "Looking for OpenGL - not found")
-ENDIF(OPENGL_FOUND)
-
# Xext
pkg_check_modules(XEXT REQUIRED xext)
IF(XEXT_FOUND)
@@ -208,20 +195,22 @@ ELSE(XFIXES_FOUND)
ENDIF(XFIXES_FOUND)
ENDIF(X11_FOUND)
-pkg_check_modules(EGL egl)
-IF(EGL_FOUND)
- MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
-ELSE(EGL_FOUND)
- MESSAGE(STATUS "Looking for EGL - not found")
-ENDIF(EGL_FOUND)
-
-# cl_khr_gl_sharing requires to build with mesa source
-#Find_Package(MesaSrc)
-#IF(MESA_SOURCE_FOUND)
-# MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
-#ELSE(MESA_SOURCE_FOUND)
-# MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
-#ENDIF(MESA_SOURCE_FOUND)
+OPTION(ENABLE_GL_SHARING "cl_khr_gl_sharing" OFF)
+
+IF(ENABLE_GL_SHARING)
+ pkg_check_modules(OPENGL REQUIRED gl)
+ IF(OPENGL_FOUND)
+ MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
+ ELSE(OPENGL_FOUND)
+ MESSAGE(STATUS "Looking for OpenGL - not found")
+ ENDIF(OPENGL_FOUND)
+ pkg_check_modules(EGL REQUIRED egl>=11.0.0)
+ IF(EGL_FOUND)
+ MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
+ ELSE(EGL_FOUND)
+ MESSAGE(STATUS "Looking for EGL - not found")
+ ENDIF(EGL_FOUND)
+ENDIF(ENABLE_GL_SHARING)
Find_Package(OCLIcd)
IF(OCLIcd_FOUND)
@@ -238,10 +227,51 @@ ENDIF(OCLIcd_FOUND)
Find_Package(PythonInterp)
OPTION(EXPERIMENTAL_DOUBLE "Enable experimental double support" OFF)
-IF(EXPERIMENTAL_DOUBLE)
+IF (EXPERIMENTAL_DOUBLE)
ADD_DEFINITIONS(-DENABLE_FP64)
ENDIF(EXPERIMENTAL_DOUBLE)
+OPTION(ENABLE_OPENCL_20 "Enable opencl 2.0 support" OFF)
+IF (ENABLE_OPENCL_20)
+ Find_Program(LSPCI lspci)
+ IF (NOT LSPCI)
+ MESSAGE(FATAL_ERROR "Looking for lspci - not found")
+ ENDIF (NOT LSPCI)
+ EXECUTE_PROCESS(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh"
+ RESULT_VARIABLE SUPPORT_OCL20_DEVICE
+ OUTPUT_VARIABLE PCI_ID_NOT_USED)
+
+ IF (NOT SUPPORT_OCL20_DEVICE EQUAL 1)
+ MESSAGE(FATAL_ERROR "Only SKL and newer devices support OpenCL 2.0 now, your device don't support.")
+ ENDIF (NOT SUPPORT_OCL20_DEVICE EQUAL 1)
+
+ IF (NOT HAVE_DRM_INTEL_BO_SET_SOFTPIN)
+ MESSAGE(FATAL_ERROR "Please update libdrm to version 2.4.66 or later to enable OpenCL 2.0.")
+ ENDIF (NOT HAVE_DRM_INTEL_BO_SET_SOFTPIN)
+
+ IF (LLVM_VERSION_NODOT VERSION_LESS 39)
+ MESSAGE(FATAL_ERROR "Please update LLVM to version 3.9 or later to enable OpenCL 2.0.")
+ ENDIF (LLVM_VERSION_NODOT VERSION_LESS 39)
+
+ ADD_DEFINITIONS(-DENABLE_OPENCL_20)
+ENDIF(ENABLE_OPENCL_20)
+
+set (LIBCL_DRIVER_VERSION_MAJOR 1)
+set (LIBCL_DRIVER_VERSION_MINOR 3)
+set (LIBCL_DRIVER_VERSION_PATCH 0)
+if (ENABLE_OPENCL_20)
+ set (LIBCL_C_VERSION_MAJOR 2)
+ set (LIBCL_C_VERSION_MINOR 0)
+else (ENABLE_OPENCL_20)
+ set (LIBCL_C_VERSION_MAJOR 1)
+ set (LIBCL_C_VERSION_MINOR 2)
+endif (ENABLE_OPENCL_20)
+configure_file (
+ "src/OCLConfig.h.in"
+ "src/OCLConfig.h"
+)
+
+
OPTION(BUILD_EXAMPLES "Build examples" OFF)
IF(BUILD_EXAMPLES)
IF(NOT X11_FOUND)
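The new ENABLE_OPENCL_20 option is a hard configure-time gate: it requires lspci, a GPU that GetGenID.sh classifies as SKL-or-newer (exit status 1; see the next diff), a libdrm new enough to provide drm_intel_bo_set_softpin_offset (>= 2.4.66), and LLVM >= 3.9, and only then defines ENABLE_OPENCL_20 and reports OpenCL C 2.0 instead of 1.2. That is also why the LIBCL_* version variables and the OCLConfig.h configure_file step moved below this block: they now depend on the option. Enable it with cmake -DENABLE_OPENCL_20=ON. A hedged sketch of consuming the result, assuming OCLConfig.h exposes the LIBCL_* values as same-named macros (OCLConfig.h.in itself is not shown in this diff):

    // Illustrative only: report the configured OpenCL C version.
    #include "OCLConfig.h"
    #include <cstdio>

    void print_clc_version() {
      std::printf("OpenCL C %d.%d\n", LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR);
    #ifdef ENABLE_OPENCL_20
      std::printf("2.0 features (pipes, enqueue_kernel, ...) compiled in\n");
    #endif
    }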
diff --git a/GetGenID.sh b/GetGenID.sh
index a0e5f85..5e5cafd 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -12,34 +12,50 @@ genpciid+=(0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 0d0b 0d1b 0d2b 0d0e 0d1e
genpciid+=(1602 1606 160a 160d 160e 1612 1616 161a 161d 161e 1622 1626 162a 162d 162e)
#BSW
genpciid+=(22b0 22b1 22b2 22b3)
+#Only enable OpenCL 2.0 after SKL.
#SKL
-genpciid+=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 191a 192a 193a)
+genpciid_20=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 191a 192a 193a)
#BXT
-genpciid+=(5a84 5a85)
+genpciid_20+=(5a84 5a85 1a84 1a85)
#KBL
-genpciid+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
-genpciid+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
+genpciid_20+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
+genpciid_20+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
n=${#pciid[*]}
i=0
m=${#genpciid[*]}
+t=${#genpciid_20[*]}
j=0
while [ $i -lt $n ]
do
- id1=${pciid[$i]}
- let j=0
+ id1=${pciid[$i]}
+ let j=0
- while [ $j -lt $m ]
- do
- id2=${genpciid[$j]}
+ while [ $j -lt $m ]
+ do
+ id2=${genpciid[$j]}
- if [ ${id1} == ${id2} ]
- then
- echo ${id1}
- exit 0
- fi
- let j=j+1
- done
+ if [ ${id1} == ${id2} ]
+ then
+ echo ${id1}
+ exit 0
+ fi
+ let j=j+1
+ done
- let i=i+1
+ let j=0
+ while [ $j -lt $t ]
+ do
+ id2=${genpciid_20[$j]}
+
+ if [ ${id1} == ${id2} ]
+ then
+ echo ${id1}
+ exit 1
+ fi
+ let j=j+1
+ done
+
+ let i=i+1
done
+exit -1
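The script's exit status is now the answer: 0 means a supported pre-SKL device was found (OpenCL 1.2 only), 1 means an OpenCL-2.0-capable device (SKL, BXT or KBL), and the trailing exit -1 (which the shell delivers to the caller as 255) means no supported device at all; the CMake check above accepts only status 1 when ENABLE_OPENCL_20 is on. A compact C++ restatement of the classification, with the two ID tables abbreviated (the full lists live in the script):

    #include <cstring>

    // Illustrative subsets of the tables in GetGenID.sh.
    static const char *gen_ids[]   = {"0d02", "1602", "22b0"}; // pre-SKL: OpenCL 1.2
    static const char *gen20_ids[] = {"1906", "5a84", "5906"}; // SKL/BXT/KBL: OpenCL 2.0

    // Mirrors the script's exit codes: 0 = 1.2-only, 1 = 2.0-capable, -1 = unknown.
    int classify(const char *pciid) {
      for (const char *id : gen_ids)
        if (!std::strcmp(pciid, id)) return 0;
      for (const char *id : gen20_ids)
        if (!std::strcmp(pciid, id)) return 1;
      return -1;
    }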
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 915d60f..d2d8710 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -48,9 +48,12 @@ set(LOCAL_OCL_HEADER_DIR "${LOCAL_OCL_HEADER_DIR}" PARENT_SCOPE)
set(LOCAL_OCL_PCH_OBJECT "${LOCAL_OCL_PCH_OBJECT}" PARENT_SCOPE)
set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_OCL_BITCODE_BIN_20 "${LOCAL_OCL_BITCODE_BIN_20}" PARENT_SCOPE)
+set(LOCAL_OCL_PCH_OBJECT_20 "${LOCAL_OCL_PCH_OBJECT_20}" PARENT_SCOPE)
set (GBE_BIN_GENERATER
- env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT})
+ env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT}
+ OCL_BITCODE_LIB_20_PATH=${LOCAL_OCL_BITCODE_BIN_20} OCL_PCH_20_PATH=${LOCAL_OCL_PCH_OBJECT_20})
if (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
set (GBE_BIN_GENERATER
diff --git a/backend/src/Android.mk b/backend/src/Android.mk
index da4d787..47d4ea7 100644
--- a/backend/src/Android.mk
+++ b/backend/src/Android.mk
@@ -62,6 +62,8 @@ BACKEND_SRC_FILES:= \
ir/immediate.cpp \
ir/structurizer.hpp \
ir/structurizer.cpp \
+ ir/reloc.hpp \
+ ir/reloc.cpp \
backend/context.cpp \
backend/context.hpp \
backend/program.cpp \
@@ -80,6 +82,7 @@ BACKEND_SRC_FILES:= \
llvm/PromoteIntegers.cpp \
llvm/ExpandLargeIntegers.cpp \
llvm/StripAttributes.cpp \
+ llvm/llvm_device_enqueue.cpp \
llvm/llvm_to_gen.cpp \
llvm/llvm_loadstore_optimization.cpp \
llvm/llvm_gen_backend.hpp \
@@ -140,12 +143,16 @@ $(shell echo " #define INTERP_OBJECT_DIR \"/system/lib64/libgbeinterp.so\"" >>
$(shell echo " #define OCL_BITCODE_BIN \"/system/lib/ocl/beignet.bc\"" >> $(gbe_config_file))
$(shell echo " #define OCL_HEADER_DIR \"/system/lib/ocl/include\"" >> $(gbe_config_file))
$(shell echo " #define OCL_PCH_OBJECT \"/system/lib/ocl/beignet.pch\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_BITCODE_BIN_20 \"/system/lib/ocl/beignet_20.bc\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_PCH_OBJECT_20 \"/system/lib/ocl/beigneti_20.pch\"" >> $(gbe_config_file))
$(shell echo "#else /*__x86_64__*/" >> $(gbe_config_file))
$(shell echo " #define GBE_OBJECT_DIR \"/system/lib/libgbe.so\"" >> $(gbe_config_file))
$(shell echo " #define INTERP_OBJECT_DIR \"/system/lib/libgbeinterp.so\"" >> $(gbe_config_file))
$(shell echo " #define OCL_BITCODE_BIN \"/system/lib/ocl/beignet.bc\"" >> $(gbe_config_file))
$(shell echo " #define OCL_HEADER_DIR \"/system/lib/ocl/include\"" >> $(gbe_config_file))
$(shell echo " #define OCL_PCH_OBJECT \"/system/lib/ocl/beignet.pch\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_BITCODE_BIN_20 \"/system/lib/ocl/beignet_20.bc\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_PCH_OBJECT_20 \"/system/lib/ocl/beigneti_20.pch\"" >> $(gbe_config_file))
$(shell echo "#endif" >> $(gbe_config_file))
$(shell echo "#else /*__ANDROID__*/" >> $(gbe_config_file))
$(shell echo " #define GBE_OBJECT_DIR \"\"" >> $(gbe_config_file))
@@ -153,6 +160,8 @@ $(shell echo " #define INTERP_OBJECT_DIR \"\"" >> $(gbe_config_file))
$(shell echo " #define OCL_BITCODE_BIN \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
$(shell echo " #define OCL_HEADER_DIR \"`pwd $(TOP)`/$(generated_path)/libocl/include\"" >> $(gbe_config_file))
$(shell echo " #define OCL_PCH_OBJECT \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_BITCODE_BIN_20 \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_PCH_OBJECT_20 \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
$(shell echo "#endif" >> $(gbe_config_file))
#Build HOST libgbe.so
@@ -162,6 +171,8 @@ LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) \
$(LLVM_INCLUDE_DIRS)
LOCAL_CPPFLAGS += $(LLVM_CFLAGS) -std=c++11 -fexceptions -DGBE_DEBUG=0 -DGBE_COMPILER_AVAILABLE=1 -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
LOCAL_CFLAGS += $(LLVM_CFLAGS) -fexceptions -DGBE_DEBUG=0 -DGBE_COMPILER_AVAILABLE=1 -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+LOCAL_CPPFLAGS += -Wno-extra-semi -Wno-gnu-anonymous-struct -Wno-nested-anon-types
+LOCAL_CFLAGS += -Wno-extra-semi -Wno-gnu-anonymous-struct -Wno-nested-anon-types
LOCAL_LDLIBS += -lpthread -lm -ldl -lLLVM -lclang
#LOCAL_STATIC_LIBRARIES := $(CLANG_MODULE_LIBS)
LOCAL_SHARED_LIBRARIES := libclang
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 41eb5ec..7c1f4db 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -3,6 +3,10 @@ set (OCL_HEADER_DIR "${BEIGNET_INSTALL_DIR}/include")
set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}/beignet.pch")
set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
+if (ENABLE_OPENCL_20)
+set (OCL_BITCODE_BIN_20 "${BEIGNET_INSTALL_DIR}/beignet_20.bc")
+set (OCL_PCH_OBJECT_20 "${BEIGNET_INSTALL_DIR}/beignet_20.pch")
+endif (ENABLE_OPENCL_20)
configure_file (
"GBEConfig.h.in"
@@ -19,6 +23,10 @@ set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PAREN
set (LOCAL_OCL_BITCODE_BIN "${OCL_OBJECT_DIR}/beignet.bc" PARENT_SCOPE)
set (LOCAL_OCL_HEADER_DIR "${OCL_OBJECT_DIR}/include/" PARENT_SCOPE)
set (LOCAL_OCL_PCH_OBJECT "${OCL_OBJECT_DIR}/beignet.local.pch" PARENT_SCOPE)
+if (ENABLE_OPENCL_20)
+set (LOCAL_OCL_BITCODE_BIN_20 "${OCL_OBJECT_DIR}/beignet_20.bc" PARENT_SCOPE)
+set (LOCAL_OCL_PCH_OBJECT_20 "${OCL_OBJECT_DIR}/beignet_20.local.pch" PARENT_SCOPE)
+endif (ENABLE_OPENCL_20)
set (GBE_SRC
${ocl_blob_file}
@@ -73,6 +81,8 @@ set (GBE_SRC
ir/immediate.cpp
ir/structurizer.hpp
ir/structurizer.cpp
+ ir/reloc.hpp
+ ir/reloc.cpp
backend/context.cpp
backend/context.hpp
backend/program.cpp
@@ -91,6 +101,7 @@ set (GBE_SRC
llvm/ExpandUtils.cpp
llvm/PromoteIntegers.cpp
llvm/ExpandLargeIntegers.cpp
+ llvm/llvm_device_enqueue.cpp
llvm/StripAttributes.cpp
llvm/llvm_to_gen.cpp
llvm/llvm_loadstore_optimization.cpp
@@ -195,6 +206,10 @@ endif ()
install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
install (FILES ${OCL_OBJECT_DIR}/beignet.bc DESTINATION ${BEIGNET_INSTALL_DIR})
install (FILES ${OCL_OBJECT_DIR}/beignet.pch DESTINATION ${BEIGNET_INSTALL_DIR})
+if (ENABLE_OPENCL_20)
+install (FILES ${OCL_OBJECT_DIR}/beignet_20.bc DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${OCL_OBJECT_DIR}/beignet_20.pch DESTINATION ${BEIGNET_INSTALL_DIR})
+endif (ENABLE_OPENCL_20)
install (FILES ${OCL_HEADER_FILES} DESTINATION ${BEIGNET_INSTALL_DIR}/include)
endif (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index b5bec14..9514483 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -6,3 +6,5 @@
#define OCL_BITCODE_BIN "@OCL_BITCODE_BIN@"
#define OCL_HEADER_DIR "@OCL_HEADER_DIR@"
#define OCL_PCH_OBJECT "@OCL_PCH_OBJECT@"
+#define OCL_BITCODE_BIN_20 "@OCL_BITCODE_BIN_20@"
+#define OCL_PCH_OBJECT_20 "@OCL_PCH_OBJECT_20@"
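Both build systems now bake parallel *_20 paths into GBEConfig.h, so the backend can load either the OpenCL 1.2 library (beignet.bc / beignet.pch) or the 2.0 one (beignet_20.bc / beignet_20.pch); note the Android makefile above writes the 2.0 PCH path as "beigneti_20.pch", a spelling carried from upstream. A hedged sketch of path resolution, assuming libgbe honors the OCL_BITCODE_LIB_20_PATH-style environment overrides that backend/CMakeLists.txt exports for the out-of-tree compiler (the actual lookup order inside libgbe is an assumption here):

    #include "GBEConfig.h"
    #include <cstdlib>

    // Illustrative: pick the bitcode library for the requested OpenCL C level,
    // preferring the environment override over the configure-time default.
    const char *bitcode_path(bool ocl20) {
      const char *env = std::getenv(ocl20 ? "OCL_BITCODE_LIB_20_PATH"
                                          : "OCL_BITCODE_LIB_PATH");
      if (env && *env) return env;
      return ocl20 ? OCL_BITCODE_BIN_20 : OCL_BITCODE_BIN;
    }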
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 675dc78..e9ddd17 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -345,7 +345,7 @@ namespace gbe
Context::Context(const ir::Unit &unit, const std::string &name) :
unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL), useDWLabel(false)
{
- GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
+ GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS || unit.getPointerSize() == ir::POINTER_64_BITS);
this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn), true);
this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
// r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
@@ -393,6 +393,7 @@ namespace gbe
if(this->kernel != NULL) {
this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
this->kernel->ctx = this;
+ this->kernel->setUseDeviceEnqueue(fn.getUseDeviceEnqueue());
}
return this->kernel;
}
@@ -471,6 +472,7 @@ namespace gbe
kernel->args[argID].info.accessQual = arg.info.accessQual;
kernel->args[argID].info.typeQual = arg.info.typeQual;
kernel->args[argID].info.argName = arg.info.argName;
+ kernel->args[argID].info.typeSize = arg.info.typeSize;
switch (arg.type) {
case ir::FunctionArgument::VALUE:
case ir::FunctionArgument::STRUCTURE:
@@ -498,6 +500,11 @@ namespace gbe
kernel->args[argID].type = GBE_ARG_SAMPLER;
kernel->args[argID].size = sizeof(void*);
break;
+ case ir::FunctionArgument::PIPE:
+ kernel->args[argID].type = GBE_ARG_PIPE;
+ kernel->args[argID].size = sizeof(void*);
+ kernel->args[argID].bti = arg.bti;
+ break;
}
}
}
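In context.cpp the kernel-argument path learns the new PIPE class (pointer-sized and carrying a binding-table index, like images), per-argument type sizes, and the function's device-enqueue flag, and the pointer-size assertion is relaxed to admit 64-bit units for the 2.0 addressing modes. From the host side a pipe is bound like any other cl_mem; a hedged sketch with the standard OpenCL 2.0 API (how the runtime routes GBE_ARG_PIPE internally lives in runtime changes not shown at this point):

    #include <CL/cl.h>

    // Hedged sketch: create a pipe and bind it to kernel argument 0.
    // Error handling minimal; the caller owns and must release the pipe.
    cl_int bind_pipe(cl_context ctx, cl_kernel kernel) {
      cl_int err;
      cl_mem pipe = clCreatePipe(ctx, 0 /*flags*/, sizeof(cl_int) /*packet size*/,
                                 1024 /*max packets*/, NULL, &err);
      if (err != CL_SUCCESS) return err;
      return clSetKernelArg(kernel, 0, sizeof(cl_mem), &pipe);
    }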
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 5653275..56fda89 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
#include "backend/gen_defs.hpp"
#include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
#include "src/cl_device_data.h"
static const struct {
@@ -70,6 +71,7 @@ static const struct {
[GEN_OPCODE_CBIT] = { .name = "cbit", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_BFREV] = { .name = "bfrev", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
@@ -103,6 +105,7 @@ static const struct {
[GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -495,6 +498,24 @@ static const char *data_port1_data_cache_msg_type[] = {
[13] = "Typed Surface Write",
};
+static const char *atomic_opration_type[] = {
+ [1] = "and",
+ [2] = "or",
+ [3] = "xor",
+ [4] = "xchg",
+ [5] = "inc",
+ [6] = "dec",
+ [7] = "add",
+ [8] = "sub",
+ [9] = "rsub",
+ [10] = "imax",
+ [11] = "imin",
+ [12] = "umax",
+ [13] = "umin",
+ [14] = "cmpxchg",
+ [15] = "invalid"
+};
+
static int column;
static int gen_version;
@@ -573,6 +594,7 @@ static int gen_version;
#define UNTYPED_RW_MSG_TYPE(inst) GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.msg_type)
#define BYTE_RW_SIMD_MODE(inst) GEN_BITS_FIELD(inst, bits3.gen7_byte_rw.simd_mode)
#define BYTE_RW_DATA_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_byte_rw.data_size)
+#define UNTYPED_RW_AOP_TYPE(inst) GEN_BITS_FIELD2(inst, bits3.gen7_atomic_op.aop_type, bits3.gen8_atomic_a64.aop_type)
#define SCRATCH_RW_OFFSET(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.offset)
#define SCRATCH_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.block_size)
#define SCRATCH_RW_INVALIDATE_AFTER_READ(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.invalidate_after_read)
@@ -1391,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
}
} else if (OPCODE(inst) != GEN_OPCODE_SEND &&
- OPCODE(inst) != GEN_OPCODE_SENDC) {
+ OPCODE(inst) != GEN_OPCODE_SENDC &&
+ OPCODE(inst) != GEN_OPCODE_SENDS) {
err |= control(file, "conditional modifier", conditional_modifier,
COND_DST_OR_MODIFIER(inst), NULL);
if (COND_DST_OR_MODIFIER(inst))
@@ -1406,7 +1429,20 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
string(file, ")");
}
- if (opcode[OPCODE(inst)].nsrc == 3) {
+ if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+ const union Gen9NativeInstruction *gen9_insn = (const union Gen9NativeInstruction *)inst;
+ pad(file, 16);
+ if (gen9_insn->bits1.sends.dest_reg_file_0 == 0)
+ reg(file, GEN_ARCHITECTURE_REGISTER_FILE, gen9_insn->bits1.sends.dest_reg_nr);
+ else
+ format(file, "g%d", gen9_insn->bits1.sends.dest_reg_nr);
+ pad(file, 32);
+ format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, GENERIC_MSG_LENGTH(inst));
+ pad(file, 48);
+ format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, gen9_insn->bits2.sends.src1_length);
+ pad(file, 64);
+ format(file, "0x%08x", gen9_insn->bits3.ud);
+ } else if (opcode[OPCODE(inst)].nsrc == 3) {
pad(file, 16);
err |= dest_3src(file, inst);
@@ -1449,7 +1485,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
}
if (OPCODE(inst) == GEN_OPCODE_SEND ||
- OPCODE(inst) == GEN_OPCODE_SENDC) {
+ OPCODE(inst) == GEN_OPCODE_SENDC ||
+ OPCODE(inst) == GEN_OPCODE_SENDS) {
enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
newline(file);
@@ -1464,7 +1501,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
target, &space);
}
- if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+ int immbti = 0;
+ if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+ const union Gen9NativeInstruction *gen9_insn = (const union Gen9NativeInstruction *)inst;
+ immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+ } else
+ immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+ if (immbti) {
switch (target) {
case GEN_SFID_VIDEO_MOTION_EST:
format(file, " (bti: %d, msg_type: %d)",
@@ -1509,6 +1552,14 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)],
data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else if(UNTYPED_RW_MSG_TYPE(inst) == 6)
+ format(file, " (bti: %d, rgba: %d, %s, %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ UNTYPED_RW_RGBA(inst),
+ data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)],
+ atomic_opration_type[UNTYPED_RW_AOP_TYPE(inst)]);
else
format(file, " not implemented");
} else {
@@ -1526,13 +1577,21 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
UNTYPED_RW_BTI(inst),
data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else if(UNTYPED_RW_MSG_TYPE(inst) == 2)
+ format(file, " (bti: %d, rgba: %d, %s, %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ UNTYPED_RW_RGBA(inst),
+ data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)],
+ atomic_opration_type[UNTYPED_RW_AOP_TYPE(inst)]);
else
format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
- UNTYPED_RW_BTI(inst),
- UNTYPED_RW_RGBA(inst),
- data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
- data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
- data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ UNTYPED_RW_BTI(inst),
+ UNTYPED_RW_RGBA(inst),
+ data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
break;
case GEN_SFID_DATAPORT_CONSTANT:
format(file, " (bti: %d, %s)",
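The disassembler gains two things here: the Gen9 split-send (sends) opcode, with its dual source registers and selectable register-vs-immediate descriptor, and decoding of untyped atomic messages, whose operation comes from the new aop_type field and is printed via the lookup table (spelled atomic_opration_type upstream). The lookup restated compactly:

    #include <cstdio>

    // Same 1..15 index space as the atomic_opration_type table above.
    static const char *aop_name[16] = {
      NULL, "and", "or", "xor", "xchg", "inc", "dec", "add",
      "sub", "rsub", "imax", "imin", "umax", "umin", "cmpxchg", "invalid"
    };

    void print_atomic(unsigned bti, unsigned aop_type) {
      if (aop_type >= 1 && aop_type <= 15)
        std::printf(" (bti: %u, %s)\n", bti, aop_name[aop_type]);
    }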
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index fc37991..b82cc43 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -126,7 +126,7 @@ namespace gbe
return gen7_insn->bits3.ud;
}
- void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+ void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
@@ -199,7 +199,7 @@ namespace gbe
return insn->bits3.ud;
}
- void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+ void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 || elemNum <= 4);
this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index d06f393..8877a50 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -42,9 +42,9 @@ namespace gbe
virtual void JMPI(GenRegister src, bool longjmp = false);
/*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
- virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum, bool useSends);
virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
- virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
virtual void setHeader(GenNativeInstruction *insn);
virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index 4f35491..4b2cd9a 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -280,7 +280,7 @@ namespace gbe
response_length);
}
- void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+ void Gen7Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t size, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1 + size;
const uint32_t response_length = 0; // Size of registers
diff --git a/backend/src/backend/gen7_encoder.hpp b/backend/src/backend/gen7_encoder.hpp
index edb711d..7585b34 100644
--- a/backend/src/backend/gen7_encoder.hpp
+++ b/backend/src/backend/gen7_encoder.hpp
@@ -45,7 +45,7 @@ namespace gbe
/*! MBlock read */
virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
/*! MBlock write */
- virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t elemSize, bool useSends);
};
}
#endif /* __GBE_GEN7_ENCODER_HPP__ */
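These signature changes (here and in the gen75 files above) thread two new parameters through every encoder generation: the data payload as a register distinct from the address, and a useSends flag. On Gen9 the selection layer can then emit a split send (SENDS), which reads address and payload from two independent register ranges rather than one contiguous message; the Gen7/Gen7.5 implementations accept the extra arguments but keep building a single-payload SEND. A sketch of the call-site shape after the change, assuming the backend's own GenEncoder/GenRegister headers (addr, data and bti are whatever the selection allocated):

    // Pre-1.3 call: p->UNTYPED_WRITE(msg, bti, elemNum), data packed after the address.
    void emit_store(GenEncoder *p, GenRegister addr, GenRegister data,
                    GenRegister bti, uint32_t elemNum, bool useSends) {
      // useSends == true selects the Gen9 split-send encoding, so addr and
      // data need not occupy contiguous GRFs.
      p->UNTYPED_WRITE(addr, data, bti, elemNum, useSends);
    }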
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 09b38b2..34baee8 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -900,6 +900,32 @@ namespace gbe
p->pop();
}
}
+ void Gen8Context::emitUntypedReadA64Instruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t elemNum = insn.extra.elem;
+ p->UNTYPED_READA64(dst, src, elemNum);
+ }
+
+ void Gen8Context::emitUntypedWriteA64Instruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t elemNum = insn.extra.elem;
+ p->UNTYPED_WRITEA64(src, elemNum);
+ }
+
+ void Gen8Context::emitByteGatherA64Instruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t elemSize = insn.extra.elem;
+ p->BYTE_GATHERA64(dst, src, elemSize);
+ }
+
+ void Gen8Context::emitByteScatterA64Instruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t elemSize = insn.extra.elem;
+ p->BYTE_SCATTERA64(src, elemSize);
+ }
+
void Gen8Context::emitRead64Instruction(const SelectionInstruction &insn)
{
const uint32_t elemNum = insn.extra.elem;
@@ -942,6 +968,7 @@ namespace gbe
GBE_ASSERT(elemNum == 1);
const GenRegister addr = ra->genReg(insn.src(elemNum));
const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+ GenRegister data = ra->genReg(insn.src(elemNum+1));
/* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -952,11 +979,15 @@ namespace gbe
}
if (bti.file == GEN_IMMEDIATE_VALUE) {
- p->UNTYPED_WRITE(addr, bti, elemNum*2);
+ p->UNTYPED_WRITE(addr, data, bti, elemNum*2, insn.extra.splitSend);
} else {
const GenRegister tmp = ra->genReg(insn.dst(elemNum));
const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
- unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+ unsigned desc = 0;
+ if (insn.extra.splitSend)
+ desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+ else
+ desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -964,11 +995,56 @@ namespace gbe
p->push();
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
- p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+ p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2, insn.extra.splitSend);
p->pop();
afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
+ void Gen8Context::emitRead64A64Instruction(const SelectionInstruction &insn) {
+ const uint32_t elemNum = insn.extra.elem;
+ GBE_ASSERT(elemNum == 1);
+
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+
+ /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
+ which we can not accept. We just fallback to 2 DW untyperead here. */
+ p->UNTYPED_READA64(dst, src, 2*elemNum);
+
+ for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+ GenRegister long_tmp = ra->genReg(insn.dst(elemID));
+ GenRegister the_long = ra->genReg(insn.dst(elemID + elemNum));
+ this->packLongVec(long_tmp, the_long, p->curr.execWidth);
+ }
+ }
+
+ void Gen8Context::emitWrite64A64Instruction(const SelectionInstruction &insn)
+ {
+ const uint32_t elemNum = insn.extra.elem;
+ GBE_ASSERT(elemNum == 1);
+ const GenRegister addr = ra->genReg(insn.src(elemNum));
+
+ /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
+ which we can not accept. We just fallback to 2 DW untypewrite here. */
+ for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+ GenRegister the_long = ra->genReg(insn.src(elemID));
+ GenRegister long_tmp = ra->genReg(insn.src(elemNum + 1 + elemID));
+ this->unpackLongVec(the_long, long_tmp, p->curr.execWidth);
+ }
+
+ p->UNTYPED_WRITEA64(addr, elemNum*2);
+ }
+ void Gen8Context::emitAtomicA64Instruction(const SelectionInstruction &insn)
+ {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const uint32_t function = insn.extra.function;
+ unsigned srcNum = insn.extra.elem;
+ const GenRegister bti = ra->genReg(insn.src(srcNum));
+ GBE_ASSERT(bti.value.ud == 0xff);
+ p->ATOMICA64(dst, function, src, bti, srcNum);
+ }
+
void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const GenRegister dst = ra->genReg(insn.dst(0));
@@ -983,7 +1059,7 @@ namespace gbe
const GenRegister dst = ra->genReg(insn.dst(0));
/* Scalar register need not to convert. */
- GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0 && src.hstride != GEN_HORIZONTAL_STRIDE_0);
+ GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0);
this->unpackLongVec(src, dst, p->curr.execWidth);
}
@@ -1280,7 +1356,7 @@ namespace gbe
nextDst = GenRegister::Qn(tempDst, 1);
p->MOV(nextDst, nextSrc);
p->pop();
- p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
p->push();
@@ -1296,7 +1372,7 @@ namespace gbe
nextDst = GenRegister::Qn(tempDst, 1);
p->MOV(nextDst, nextSrc);
p->pop();
- p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
}
@@ -1317,6 +1393,67 @@ namespace gbe
p->pop();
}
+ void ChvContext::emitStackPointer(void) {
+ using namespace ir;
+
+ // Only emit stack pointer computation if we use a stack
+ if (kernel->getStackSize() == 0)
+ return;
+
+ // Check that everything is consistent in the kernel code
+ const uint32_t perLaneSize = kernel->getStackSize();
+ GBE_ASSERT(perLaneSize > 0);
+
+ const GenRegister selStatckPtr = this->simdWidth == 8 ?
+ GenRegister::ud8grf(ir::ocl::stackptr) :
+ GenRegister::ud16grf(ir::ocl::stackptr);
+ const GenRegister stackptr = ra->genReg(selStatckPtr);
+ // borrow block ip as temporary register as we will
+ // initialize block ip later.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+ loadLaneID(stackptr);
+
+ // We compute the per-lane stack pointer here
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
+ // let private address start from zero
+ //p->MOV(stackptr, GenRegister::immud(0));
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
+ p->curr.execWidth = this->simdWidth;
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+ p->curr.execWidth = 1;
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+ p->curr.execWidth = this->simdWidth;
+ p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+ if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+ const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+ GenRegister::ul8grf(ir::ocl::stackptr) :
+ GenRegister::ul16grf(ir::ocl::stackptr);
+ GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+ GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+ int simdWidth = p->curr.execWidth;
+ if (simdWidth == 16) {
+ // we need do second quarter first, because the dst type is QW,
+ // while the src is DW. If we do first quater first, the 1st
+ // quarter's dst would contain the 2nd quarter's src.
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+ p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+ }
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(sp, stackptr);
+ p->MOV(stackptr2, sp);
+ }
+ p->pop();
+ }
+
/* Init value according to WORKGROUP OP
* Emit assert is invalid combination operation - datatype */
static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
@@ -1351,6 +1488,10 @@ namespace gbe
p->MOV(dataReg, GenRegister::immint64(0x0));
else if (dataReg.type == GEN_TYPE_UL)
p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else if (dataReg.type == GEN_TYPE_W)
+ p->MOV(dataReg, GenRegister::immw(0x0));
+ else if (dataReg.type == GEN_TYPE_UW)
+ p->MOV(dataReg, GenRegister::immuw(0x0));
else
GBE_ASSERT(0); /* unsupported data-type */
}
@@ -1371,6 +1512,10 @@ namespace gbe
p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
else if (dataReg.type == GEN_TYPE_UL)
p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_W)
+ p->MOV(dataReg, GenRegister::immw(0x7FFF));
+ else if (dataReg.type == GEN_TYPE_UW)
+ p->MOV(dataReg, GenRegister::immuw(0xFFFF));
else
GBE_ASSERT(0); /* unsupported data-type */
}
@@ -1391,6 +1536,10 @@ namespace gbe
p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
else if (dataReg.type == GEN_TYPE_UL)
p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else if (dataReg.type == GEN_TYPE_W)
+ p->MOV(dataReg, GenRegister::immw(0x8000));
+ else if (dataReg.type == GEN_TYPE_UW)
+ p->MOV(dataReg, GenRegister::immuw(0x0));
else
GBE_ASSERT(0); /* unsupported data-type */
}
@@ -1650,7 +1799,7 @@ namespace gbe
GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
GenRegister localBarrier = ra->genReg(insn.src(5));
- uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t wg_op = insn.extra.wgop.workgroupOp;
uint32_t simd = p->curr.execWidth;
int32_t jip0, jip1;
@@ -1669,8 +1818,8 @@ namespace gbe
/* use of continuous GRF allocation from insn selection */
GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
- GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
- GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+ GenRegister msgAddr = GenRegister::retype(msg, GEN_TYPE_UD);
+ GenRegister msgData = GenRegister::retype(ra->genReg(insn.dst(3)), dst.type);
/* do some calculation within each thread */
wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
@@ -1705,13 +1854,15 @@ namespace gbe
{
GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
- p->MOV(msgData.offset(msgData, 0), threadDataL);
- p->MOV(msgData.offset(msgData, 1), threadDataH);
-
+ GenRegister msgDataL = GenRegister::retype(msgData, GEN_TYPE_D);
+ GenRegister msgDataH = msgDataL.offset(msgDataL, 1);
p->curr.execWidth = 8;
+ p->MOV(msgDataL, threadDataL);
+ p->MOV(msgDataH, threadDataH);
+
p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
p->ADD(msgAddr, msgAddr, msgSlmOff);
- p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+ p->UNTYPED_WRITE(msgAddr, msgData, GenRegister::immw(0xFE), 2, insn.extra.wgop.splitSend);
}
else
{
@@ -1719,7 +1870,7 @@ namespace gbe
p->MOV(msgData, threadData);
p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
p->ADD(msgAddr, msgAddr, msgSlmOff);
- p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+ p->UNTYPED_WRITE(msgAddr, msgData, GenRegister::immw(0xFE), 1, insn.extra.wgop.splitSend);
}
/* init partialData register, it will hold the final result */
@@ -1804,30 +1955,38 @@ namespace gbe
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
{
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
/* workaround QW datatype on CMP */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
- dst.offset(dst, 1, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
- dst.offset(dst, 2, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
- dst.offset(dst, 3, 0), partialData);
- }
+ p->push();
+ p->curr.execWidth = 8;
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ if (simd == 16) {
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->SEL_CMP(GEN_CONDITIONAL_LE, GenRegister::Qn(dst, 1),
+ GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+ }
+ p->pop();
+ } else
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
}
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
/* workaround QW datatype on CMP */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
- dst.offset(dst, 1, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
- dst.offset(dst, 2, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
- dst.offset(dst, 3, 0), partialData);
- }
+ p->push();
+ p->curr.execWidth = 8;
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ if (simd == 16) {
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->SEL_CMP(GEN_CONDITIONAL_GE, GenRegister::Qn(dst, 1),
+ GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+ }
+ p->pop();
+ } else
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
}
}
@@ -1857,7 +2016,7 @@ namespace gbe
const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(0)), dst.type);
GenRegister threadData = ra->genReg(insn.src(1));
- uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t wg_op = insn.extra.wgop.workgroupOp;
uint32_t simd = p->curr.execWidth;
/* masked elements should be properly set to init value */
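Beyond the new A64 (64-bit stateless, bti 255) read/write/gather/scatter/atomic emitters, this file gives Cherryview its own emitStackPointer: private memory now starts at address 0, and the per-lane stack pointer is (threadId * simdWidth + laneId) * perLaneSize, widened to a 64-bit QW register (second quarter first, so the wider destination cannot clobber its own source) when the pointer family is FAMILY_QWORD. The workgroup min/max paths are likewise reworked to split QW SEL_CMP into per-quarter SIMD8 halves, and the scan/reduce init values gain W/UW cases. The offset arithmetic in plain form:

    #include <cstdint>

    // Restates the per-lane stack offset computed in ChvContext::emitStackPointer.
    // Example: threadId=3, simdWidth=16, laneId=5, perLaneSize=1024 -> 54272.
    uint64_t stack_offset(uint32_t threadId, uint32_t simdWidth,
                          uint32_t laneId, uint32_t perLaneSize) {
      return (uint64_t)(threadId * simdWidth + laneId) * perLaneSize;
    }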
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index ec1358c..6b75540 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -66,8 +66,15 @@ namespace gbe
virtual void emitFloatToI64Instruction(const SelectionInstruction &insn);
virtual void emitI64MADSATInstruction(const SelectionInstruction &insn);
+ virtual void emitUntypedWriteA64Instruction(const SelectionInstruction &insn);
+ virtual void emitUntypedReadA64Instruction(const SelectionInstruction &insn);
+ virtual void emitByteGatherA64Instruction(const SelectionInstruction &insn);
+ virtual void emitByteScatterA64Instruction(const SelectionInstruction &insn);
virtual void emitWrite64Instruction(const SelectionInstruction &insn);
virtual void emitRead64Instruction(const SelectionInstruction &insn);
+ virtual void emitWrite64A64Instruction(const SelectionInstruction &insn);
+ virtual void emitRead64A64Instruction(const SelectionInstruction &insn);
+ virtual void emitAtomicA64Instruction(const SelectionInstruction &insn);
virtual void emitI64MULInstruction(const SelectionInstruction &insn);
virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
@@ -118,6 +125,7 @@ namespace gbe
virtual void newSelection(void);
virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+ virtual void emitStackPointer(void);
};
}
#endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 277260f..a33fbac 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -73,17 +73,36 @@ namespace gbe
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
setMessageDescriptor(insn, sfid, msg_length, response_length);
- gen8_insn->bits3.gen7_untyped_rw.msg_type = msg_type;
- gen8_insn->bits3.gen7_untyped_rw.bti = bti;
- gen8_insn->bits3.gen7_untyped_rw.rgba = rgba;
+ gen8_insn->bits3.gen8_untyped_rw_a64.msg_type = msg_type;
+ gen8_insn->bits3.gen8_untyped_rw_a64.bti = bti;
+ gen8_insn->bits3.gen8_untyped_rw_a64.rgba = rgba;
if (curr.execWidth == 8)
- gen8_insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+ gen8_insn->bits3.gen8_untyped_rw_a64.simd_mode = GEN_UNTYPED_SIMD8;
else if (curr.execWidth == 16)
- gen8_insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+ gen8_insn->bits3.gen8_untyped_rw_a64.simd_mode = GEN_UNTYPED_SIMD16;
else
NOT_SUPPORTED;
}
+ static void setDPByteScatterGatherA64(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t block_size,
+ uint32_t data_size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ gen8_insn->bits3.gen8_scatter_rw_a64.msg_type = msg_type;
+ gen8_insn->bits3.gen8_scatter_rw_a64.bti = bti;
+ gen8_insn->bits3.gen8_scatter_rw_a64.data_sz = data_size;
+ gen8_insn->bits3.gen8_scatter_rw_a64.block_sz = block_size;
+ GBE_ASSERT(p->curr.execWidth == 8);
+ }
+
void Gen8Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
unsigned char msg_type, uint32_t msg_length, bool header_present)
{
@@ -134,7 +153,7 @@ namespace gbe
return gen8_insn->bits3.ud;
}
- void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+ void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
@@ -150,6 +169,48 @@ namespace gbe
this->setSrc1(insn, bti);
}
}
+
+ unsigned Gen8Encoder::setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long) {
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ assert(srcNum <= 3);
+
+ if (this->curr.execWidth == 8) {
+ msg_length = srcNum + 1 + type_long;
+ if(srcNum == 3 && type_long)
+ msg_length++;
+ response_length = 1 + type_long;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2 * (srcNum + 1);
+ response_length = 2;
+ } else
+ NOT_IMPLEMENTED;
+
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
+ gen8_insn->bits3.gen8_atomic_a64.msg_type = GEN8_P1_UNTYPED_ATOMIC_A64;
+ gen8_insn->bits3.gen8_atomic_a64.bti = bti;
+ gen8_insn->bits3.gen8_atomic_a64.return_data = 1;
+ gen8_insn->bits3.gen8_atomic_a64.aop_type = function;
+ gen8_insn->bits3.gen8_atomic_a64.data_size = type_long;
+
+ return gen8_insn->bits3.ud;
+ }
+
+ void Gen8Encoder::ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ int type_long = (dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L) ? 1: 0;
+ setAtomicA64MessageDesc(insn, function, bti.value.ud, srcNum, type_long);
+ }
+
unsigned Gen8Encoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
@@ -207,7 +268,7 @@ namespace gbe
return insn->bits3.ud;
}
- void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+ void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 && elemNum <= 4);
this->setHeader(insn);
@@ -229,6 +290,101 @@ namespace gbe
}
}
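+ /*! A64 untyped read: SIMD8 only, with a two-register payload holding the
+ 64-bit addresses and elemNum response registers. */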
+ void Gen8Encoder::UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 && elemNum <= 4);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ assert(this->curr.execWidth == 8);
+
+ if (this->curr.execWidth == 8) {
+ msg_length = 2;
+ response_length = elemNum;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(insn,
+ 255, // stateless bti
+ untypedRWMask[elemNum],
+ GEN8_P1_UNTYPED_READ_A64,
+ msg_length,
+ response_length);
+ }
+
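+ /*! A64 untyped write: the two address registers are followed by elemNum
+ data registers in the payload (SIMD8 only). */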
+ void Gen8Encoder::UNTYPED_WRITEA64(GenRegister msg, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 && elemNum <= 4);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ this->setHeader(insn);
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ msg_length = 2 + elemNum;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(insn,
+ 255, // stateless bti
+ untypedRWMask[elemNum],
+ GEN8_P1_UNTYPED_WRITE_A64,
+ msg_length,
+ response_length);
+ }
+
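+ /*! A64 byte gather: SIMD8 read of byte-granularity elements through the
+ stateless (0xff) surface. */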
+ void Gen8Encoder::BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+ this->setSrc1(insn, GenRegister::immud(0));
+ //setByteGatherMessageDesc(insn, bti.value.ud, elemSize);
+ GBE_ASSERT(this->curr.execWidth == 8);
+ const uint32_t msg_length = 2;
+ const uint32_t response_length = 1;
+ setDPByteScatterGatherA64(this,
+ insn,
+ 0xff,
+ 0x0,
+ elemSize,
+ GEN8_P1_BYTE_GATHER_A64,
+ msg_length,
+ response_length);
+ }
+
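+ /*! A64 byte scatter: SIMD8 counterpart of BYTE_GATHERA64; two address
+ registers plus one data register form the payload. */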
+ void Gen8Encoder::BYTE_SCATTERA64(GenRegister msg, uint32_t elemSize) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ // only SIMD8 is supported
+ GBE_ASSERT(this->curr.execWidth == 8);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+ this->setSrc1(insn, GenRegister::immud(0));
+ const uint32_t msg_length = 3;
+ const uint32_t response_length = 0;
+ setDPByteScatterGatherA64(this,
+ insn,
+ 0xff,
+ 0x0,
+ elemSize,
+ GEN8_P1_BYTE_SCATTER_A64,
+ msg_length,
+ response_length);
+ }
+
void Gen8Encoder::LOAD_INT64_IMM(GenRegister dest, GenRegister value) {
MOV(dest, value);
}
@@ -275,6 +431,30 @@ namespace gbe
this->setSrc1(&insn, GenRegister::immd(jip*8));
}
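+ /*! Emit a memory fence through the dataport; flushRWCache additionally
+ sets the flush_rw bit so pending read/write caches are flushed. */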
+ void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, dst);
+ setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
+ gen8_insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
+ gen8_insn->bits3.gen7_memory_fence.commit_enable = 0x1;
+ gen8_insn->bits3.gen7_memory_fence.flush_rw = flushRWCache ? 1 : 0;
+ }
+
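+ /*! Send a sampler cache-flush message so that subsequent sampler reads
+ observe data written through the dataport. */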
+ void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, GenRegister::ud8grf(0,0));
+ unsigned msg_type = GEN_SAMPLER_MESSAGE_CACHE_FLUSH;
+ unsigned simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD32_64;
+ setSamplerMessage(insn, 0, 0, msg_type,
+ 1, 1,
+ true,
+ simd_mode, 0);
+ }
void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
@@ -406,9 +586,10 @@ namespace gbe
assert(gen8_insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
- if (reg.file == GEN_IMMEDIATE_VALUE)
+ if (reg.file == GEN_IMMEDIATE_VALUE) {
+ assert(!((reg.type == GEN_TYPE_L || reg.type == GEN_TYPE_UL || reg.type == GEN_TYPE_DF_IMM) && reg.value.u64 > 0xFFFFFFFFl));
gen8_insn->bits3.ud = reg.value.ud;
- else {
+ } else {
assert (reg.address_mode == GEN_ADDRESS_DIRECT);
if (gen8_insn->header.access_mode == GEN_ALIGN_1) {
gen8_insn->bits3.da1.src1_subreg_nr = reg.subnr;
@@ -637,4 +818,69 @@ namespace gbe
gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute;
gen8_insn->bits1.da3srcacc.src2_negate = src2.negation;
}
+
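+ /*! Fill the message descriptor of an A64 OWord block read/write; reads are
+ issued with the unaligned sub-type, writes with the aligned one. */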
+ static void setOBlockRWA64(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+
+ gen8_insn->bits3.gen8_block_rw_a64.msg_type = msg_type;
+ gen8_insn->bits3.gen8_block_rw_a64.bti = bti;
+ // For OWord Block read, we use unaligned read
+ gen8_insn->bits3.gen8_block_rw_a64.msg_sub_type = msg_type == GEN8_P1_BLOCK_READ_A64 ? 1 : 0;
+ gen8_insn->bits3.gen8_block_rw_a64.block_size = size;
+ gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
+ }
+
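+ /*! A64 OWord block read: reads ow_size OWords, packing two OWords into
+ each response register. */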
+ void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1;
+ uint32_t sizeinreg = ow_size / 2;
+ // half a register still needs one full register
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+ const uint32_t response_length = sizeinreg; // Size is in reg
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setOBlockRWA64(this,
+ insn,
+ bti,
+ block_size,
+ GEN8_P1_BLOCK_READ_A64,
+ msg_length,
+ response_length);
+
+ }
+
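+ /*! A64 OWord block write: the header register carries the address and the
+ following ow_size/2 registers (minimum one) carry the data. */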
+ void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t sizeinreg = ow_size / 2;
+ // half a register still needs one full register
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
+ const uint32_t response_length = 0;
+ const uint32_t block_size = getOBlockSize(ow_size);
+
+ this->setHeader(insn);
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ setOBlockRWA64(this,
+ insn,
+ bti,
+ block_size,
+ GEN8_P1_BLOCK_WRITE_A64,
+ msg_length,
+ response_length);
+ }
} /* End of the name space. */
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 12b3765..fa62a8d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -38,20 +38,27 @@ namespace gbe
/*! Jump indexed instruction */
virtual void JMPI(GenRegister src, bool longjmp = false);
+ virtual void FENCE(GenRegister dst, bool flushRWCache);
/*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
virtual void F16TO32(GenRegister dest, GenRegister src0);
virtual void F32TO16(GenRegister dest, GenRegister src0);
virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
- virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+ virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
- virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
+ virtual void UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum);
+ virtual void UNTYPED_WRITEA64(GenRegister src, uint32_t elemNum);
+ virtual void BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize);
+ virtual void BYTE_SCATTERA64(GenRegister src, uint32_t elemSize);
virtual void setHeader(GenNativeInstruction *insn);
virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
unsigned char msg_type, uint32_t msg_length,
bool header_present);
+ virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
@@ -62,6 +69,7 @@ namespace gbe
GenRegister src1 = GenRegister::null());
virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+ virtual unsigned setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long);
virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
void setSrc0WithAcc(GenNativeInstruction *insn, GenRegister reg, uint32_t accN);
@@ -71,6 +79,10 @@ namespace gbe
uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc);
void MADM(GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2,
uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t src2Acc);
+ /*! A64 OBlock read */
+ virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! A64 OBlock write */
+ virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
};
}
#endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index 549948a..446e7f9 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -540,7 +540,11 @@ union Gen8NativeInstruction
/*! Memory fence */
struct {
uint32_t bti:8;
- uint32_t pad:5;
+ uint32_t pad:1;
+ uint32_t flush_instruction:1;
+ uint32_t flush_texture:1;
+ uint32_t flush_constant:1;
+ uint32_t flush_rw:1;
uint32_t commit_enable:1;
uint32_t msg_type:4;
uint32_t pad2:1;
@@ -566,6 +570,46 @@ union Gen8NativeInstruction
uint32_t end_of_thread:1;
} gen7_atomic_op;
+ /*! atomic a64 messages */
+ struct {
+ uint32_t bti:8;
+ uint32_t aop_type:4;
+ uint32_t data_size:1;
+ uint32_t return_data:1;
+ uint32_t msg_type:5;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad3:2;
+ uint32_t end_of_thread:1;
+ } gen8_atomic_a64;
+
+ // gen8 untyped read/write with A64 (stateless) addressing
+ struct {
+ uint32_t bti:8;
+ uint32_t rgba:4;
+ uint32_t simd_mode:2;
+ uint32_t msg_type:5;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen8_untyped_rw_a64;
+
+ struct {
+ uint32_t bti:8;
+ uint32_t block_sz:2; // 00: byte, 01: dword
+ uint32_t data_sz:2; // 0: 1 block, 1: 2 blocks
+ uint32_t ignored:2;
+ uint32_t msg_type:5; // 10000: scatter read, 11010: scatter write, 11001: A64 untyped write
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen8_scatter_rw_a64;
+
struct {
uint32_t src1_subreg_nr_high:1;
uint32_t src1_reg_nr:8;
@@ -604,6 +648,19 @@ union Gen8NativeInstruction
uint32_t end_of_thread:1;
} gen7_msg_gw;
+ struct {
+ uint32_t bti:8;
+ uint32_t block_size:3; // oword size
+ uint32_t msg_sub_type:2; // 00: OWord block R/W, 01: unaligned OWord block read, 10: OWord dual block R/W, 11: HWord block R/W
+ uint32_t ignored:1;
+ uint32_t msg_type:5; // 10100: A64 block read, 10101: A64 block write
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen8_block_rw_a64;
+
struct {
uint32_t jip:32;
} gen8_branch;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index dc05756..483b2c3 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -22,6 +22,7 @@
#include "backend/gen9_context.hpp"
#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_program.hpp"
namespace gbe
{
@@ -34,9 +35,10 @@ namespace gbe
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
- if (barrierType == ir::syncGlobalBarrier) {
- p->FENCE(fenceDst);
+ if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+ p->FENCE(fenceDst, imageFence);
p->MOV(fenceDst, fenceDst);
}
p->push();
@@ -54,6 +56,10 @@ namespace gbe
p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT();
p->pop();
+ if (imageFence) {
+ p->FLUSH_SAMPLERCACHE(fenceDst);
+ p->MOV(fenceDst, fenceDst);
+ }
}
void BxtContext::newSelection(void) {
@@ -165,6 +171,67 @@ namespace gbe
p->pop();
}
+ void BxtContext::emitStackPointer(void) {
+ using namespace ir;
+
+ // Only emit stack pointer computation if we use a stack
+ if (kernel->getStackSize() == 0)
+ return;
+
+ // Check that everything is consistent in the kernel code
+ const uint32_t perLaneSize = kernel->getStackSize();
+ GBE_ASSERT(perLaneSize > 0);
+
+ const GenRegister selStackPtr = this->simdWidth == 8 ?
+ GenRegister::ud8grf(ir::ocl::stackptr) :
+ GenRegister::ud16grf(ir::ocl::stackptr);
+ const GenRegister stackptr = ra->genReg(selStackPtr);
+ // borrow block ip as a temporary register, as we will
+ // initialize block ip later.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+ loadLaneID(stackptr);
+
+ // We compute the per-lane stack pointer here
+ // threadId * perThreadSize + laneId * perLaneSize, or equivalently
+ // (threadId * simdWidth + laneId) * perLaneSize
+ // let the private address space start from zero
+ //p->MOV(stackptr, GenRegister::immud(0));
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
+ p->curr.execWidth = this->simdWidth;
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); // threadId * simdWidth + laneId, must be < 64K
+ p->curr.execWidth = 1;
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+ p->curr.execWidth = this->simdWidth;
+ p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+ if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+ const GenRegister selStackPtr2 = this->simdWidth == 8 ?
+ GenRegister::ul8grf(ir::ocl::stackptr) :
+ GenRegister::ul16grf(ir::ocl::stackptr);
+ GenRegister stackptr2 = ra->genReg(selStackPtr2);
+ GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+ int simdWidth = p->curr.execWidth;
+ if (simdWidth == 16) {
+ // we need to do the second quarter first, because the dst type is QW
+ // while the src is DW. If we did the first quarter first, the 1st
+ // quarter's dst would overwrite the 2nd quarter's src.
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+ p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+ }
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(sp, stackptr);
+ p->MOV(stackptr2, sp);
+ }
+ p->pop();
+ }
+
void KblContext::newSelection(void) {
this->sel = GBE_NEW(SelectionKbl, *this);
}
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 2f24b56..9977e9a 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -67,6 +67,7 @@ namespace gbe
virtual void newSelection(void);
virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+ virtual void emitStackPointer(void);
};
/* This class is used to implement the kabylake
specific logic for context. */
diff --git a/backend/src/backend/gen9_encoder.cpp b/backend/src/backend/gen9_encoder.cpp
index 80df50d..b37fd98 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -26,6 +26,14 @@
**********************************************************************/
#include "backend/gen9_encoder.hpp"
+#include "backend/gen9_instruction.hpp"
+static const uint32_t untypedRWMask[] = {
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
+ GEN_UNTYPED_ALPHA,
+ 0
+};
namespace gbe
{
@@ -60,9 +68,237 @@ namespace gbe
this->setHeader(insn);
this->setDst(insn, dest);
this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
simd_mode, return_format);
}
+
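+ /*! Fill the common dst/src0/src1 operand fields of a split-send (sends)
+ instruction; only direct, subregister-0 operands are supported. */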
+ void Gen9Encoder::setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1)
+ {
+ assert(dst.subnr == 0 && src0.subnr == 0 && src1.subnr == 0);
+
+ if (dst.file == GEN_ARCHITECTURE_REGISTER_FILE)
+ gen9_insn->bits1.sends.dest_reg_file_0 = 0;
+ else if (dst.file == GEN_GENERAL_REGISTER_FILE)
+ gen9_insn->bits1.sends.dest_reg_file_0 = 1;
+ else
+ NOT_SUPPORTED;
+
+ gen9_insn->bits1.sends.src1_reg_file_0 = 1;
+ gen9_insn->bits1.sends.src1_reg_nr = src1.nr;
+ gen9_insn->bits1.sends.dest_subreg_nr = 0;
+ gen9_insn->bits1.sends.dest_reg_nr = dst.nr;
+ gen9_insn->bits1.sends.dest_address_mode = 0; //direct mode
+ gen9_insn->bits2.sends.src0_subreg_nr = 0;
+ gen9_insn->bits2.sends.src0_reg_nr = src0.nr;
+ gen9_insn->bits2.sends.src0_address_mode = 0;
+ }
+
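+ /*! Message descriptor for the address half of a split-send untyped write;
+ the data register count is carried in src1_length, not msg_length. */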
+ unsigned Gen9Encoder::setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum)
+ {
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ if (this->curr.execWidth == 8) {
+ msg_length = 1;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2;
+ } else
+ NOT_IMPLEMENTED;
+ setDPUntypedRW(insn,
+ bti,
+ untypedRWMask[elemNum],
+ GEN75_P1_UNTYPED_SURFACE_WRITE,
+ msg_length,
+ response_length);
+ return insn->bits3.ud;
+ }
+
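+ /*! Untyped write, optionally as a split send with the address payload in
+ src0 and the data payload in src1. */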
+ void Gen9Encoder::UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends)
+ {
+ if (!useSends)
+ Gen8Encoder::UNTYPED_WRITE(addr, data, bti, elemNum, false);
+ else {
+ GBE_ASSERT(addr.reg() != data.reg());
+
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+ Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+ assert(elemNum >= 1 && elemNum <= 4);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+ if (this->curr.execWidth == 8)
+ gen9_insn->bits2.sends.src1_length = elemNum;
+ else if (this->curr.execWidth == 16)
+ gen9_insn->bits2.sends.src1_length = 2 * elemNum;
+ else
+ NOT_SUPPORTED;
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ gen9_insn->bits2.sends.sel_reg32_desc = 0;
+ setUntypedWriteSendsMessageDesc(insn, bti.value.ud, elemNum);
+ } else
+ gen9_insn->bits2.sends.sel_reg32_desc = 1;
+ }
+ }
+
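+ /*! Typed (image) write as a split send: five header/coordinate registers
+ in src0 and four data registers in src1. */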
+ void Gen9Encoder::TYPED_WRITE(GenRegister header, GenRegister data, bool header_present, unsigned char bti, bool useSends)
+ {
+ if (!useSends)
+ Gen8Encoder::TYPED_WRITE(header, data, header_present, bti, false);
+ else {
+ GBE_ASSERT(header.reg() != data.reg());
+
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+ Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+ assert(header_present);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+ gen9_insn->bits2.sends.src1_length = 4; // src0_length: 5 (header+u+v+w+lod), src1_length: 4 (data)
+
+ gen9_insn->bits2.sends.sel_reg32_desc = 0;
+ setTypedWriteMessage(insn, bti, GEN_TYPED_WRITE, 5, header_present);
+ }
+ }
+
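+ /*! Message descriptor for the address half of a split-send byte scatter;
+ as above, the data registers are counted in src1_length. */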
+ unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize)
+ {
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ if (this->curr.execWidth == 8) {
+ msg_length = 1;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2;
+ } else
+ NOT_IMPLEMENTED;
+
+ setDPByteScatterGather(insn,
+ bti,
+ elemSize,
+ GEN7_BYTE_SCATTER,
+ msg_length,
+ response_length);
+ return insn->bits3.ud;
+ }
+
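+ /*! Byte scatter, optionally as a split send with separate address and
+ data payloads. */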
+ void Gen9Encoder::BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends)
+ {
+ if (!useSends)
+ Gen8Encoder::BYTE_SCATTER(addr, data, bti, elemSize, false);
+ else {
+ GBE_ASSERT(addr.reg() != data.reg());
+
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+ Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+ setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+ if (this->curr.execWidth == 8)
+ gen9_insn->bits2.sends.src1_length = 1;
+ else if (this->curr.execWidth == 16)
+ gen9_insn->bits2.sends.src1_length = 2;
+ else
+ NOT_SUPPORTED;
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ gen9_insn->bits2.sends.sel_reg32_desc = 0;
+ setByteScatterSendsMessageDesc(insn, bti.value.ud, elemSize);
+ } else
+ gen9_insn->bits2.sends.sel_reg32_desc = 1;
+ }
+ }
+
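+ /*! Atomic op as a split send: the address goes in src0 and the remaining
+ srcNum-1 operand registers in src1. */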
+ void Gen9Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends)
+ {
+ if (!useSends)
+ Gen8Encoder::ATOMIC(dst, function, addr, data, bti, srcNum, false);
+ else {
+ GBE_ASSERT(addr.reg() != data.reg());
+
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+ Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ setSendsOperands(gen9_insn, dst, addr, data);
+ if (this->curr.execWidth == 8)
+ gen9_insn->bits2.sends.src1_length = srcNum - 1;
+ else if (this->curr.execWidth == 16)
+ gen9_insn->bits2.sends.src1_length = 2 * (srcNum - 1);
+ else
+ NOT_SUPPORTED;
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ gen9_insn->bits2.sends.sel_reg32_desc = 0;
+ setAtomicMessageDesc(insn, function, bti.value.ud, 1);
+ } else
+ gen9_insn->bits2.sends.sel_reg32_desc = 1;
+ }
+ }
+
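+ /*! OWord block write as a split send: the header in src0 and ow_size/2
+ data registers (minimum one) in src1. */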
+ void Gen9Encoder::OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends)
+ {
+ if (!useSends)
+ Gen8Encoder::OBWRITE(header, data, bti, ow_size, false);
+ else {
+ GBE_ASSERT(data.reg() != header.reg());
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+ Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+ setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+
+ uint32_t dataRegs = ow_size / 2;
+ // half a register still needs one full register
+ if (dataRegs == 0)
+ dataRegs = 1;
+ gen9_insn->bits2.sends.src1_length = dataRegs;
+
+ const uint32_t block_size = getOBlockSize(ow_size);
+ const uint32_t msg_length = 1;
+ const uint32_t response_length = 0;
+ setOBlockRW(insn,
+ bti,
+ block_size,
+ GEN7_OBLOCK_WRITE,
+ msg_length,
+ response_length);
+ }
+ }
+
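+ /*! Media block write as a split send: the header in src0 and data_size
+ registers of block data in src1. */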
+ void Gen9Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends)
+ {
+ if (!useSends)
+ Gen8Encoder::MBWRITE(header, data, bti, data_size, false);
+ else {
+ GBE_ASSERT(data.reg() != header.reg());
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+ Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+ setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+ gen9_insn->bits2.sends.src1_length = data_size;
+
+ const uint32_t msg_length = 1;
+ const uint32_t response_length = 0;
+ setMBlockRW(insn,
+ bti,
+ GEN75_P1_MEDIA_TYPED_BWRITE,
+ msg_length,
+ response_length);
+ }
+ }
} /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp b/backend/src/backend/gen9_encoder.hpp
index 319e871..2eaa538 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -47,7 +47,15 @@ namespace gbe
uint32_t return_format,
bool isLD,
bool isUniform);
-
+ void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1);
+ virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
+ virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool header_present, unsigned char bti, bool useSends);
+ virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends);
+ virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+ virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends);
+ virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends);
};
}
#endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen9_instruction.hpp b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 0000000..16114ca
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo, Yejun <yejun.guo at intel.com>
+ */
+
+
+#ifndef __GEN9_INSTRUCTION_HPP__
+#define __GEN9_INSTRUCTION_HPP__
+
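+/*! Bit layout of the Gen9 split-send (sends) instruction; only the fields
+ * that differ from the ordinary send encoding are described here. */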
+union Gen9NativeInstruction
+{
+ struct {
+ struct {
+ uint32_t opcode:7;
+ uint32_t pad:1;
+ uint32_t access_mode:1;
+ uint32_t dependency_control:2;
+ uint32_t nib_ctrl:1;
+ uint32_t quarter_control:2;
+ uint32_t thread_control:2;
+ uint32_t predicate_control:4;
+ uint32_t predicate_inverse:1;
+ uint32_t execution_size:3;
+ uint32_t destreg_or_condmod:4;
+ uint32_t acc_wr_control:1;
+ uint32_t cmpt_control:1;
+ uint32_t debug_control:1;
+ uint32_t saturate:1;
+ } header;
+
+ union {
+ struct {
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t mask_control:1;
+ uint32_t dest_reg_file_0:1;
+ uint32_t src1_reg_file_0:1;
+ uint32_t dest_reg_type:4;
+ uint32_t pad0:3;
+ uint32_t src1_reg_nr:8;
+ uint32_t dest_subreg_nr:1;
+ uint32_t dest_reg_nr:8;
+ uint32_t pad1:1;
+ uint32_t pad2:1; //direct mode is used
+ uint32_t dest_address_mode:1;
+ } sends;
+
+ uint32_t ud;
+ }bits1;
+
+ union {
+ struct {
+ uint32_t src1_length:4; // exdesc[9:6]
+ uint32_t src0_subreg_nr:1;
+ uint32_t src0_reg_nr:8;
+ uint32_t sel_reg32_desc:1;
+ uint32_t pad0:1;
+ uint32_t src0_address_mode:1;
+ uint32_t exdesc_31_16:16;
+ } sends;
+
+ uint32_t ud;
+ } bits2;
+
+ union {
+ uint32_t ud;
+ } bits3;
+ };
+};
+#endif
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4f73237..bb104cf 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -248,6 +248,23 @@ namespace gbe
p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
p->curr.execWidth = this->simdWidth;
p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize
+ if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+ const GenRegister selStackPtr2 = this->simdWidth == 8 ?
+ GenRegister::ul8grf(ir::ocl::stackptr) :
+ GenRegister::ul16grf(ir::ocl::stackptr);
+ const GenRegister stackptr2 = ra->genReg(selStackPtr2);
+ int simdWidth = p->curr.execWidth;
+ if (simdWidth == 16) {
+ // we need to do the second quarter first, because the dst type is QW
+ // while the src is DW. If we did the first quarter first, the 1st
+ // quarter's dst would overwrite the 2nd quarter's src.
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(stackptr,1));
+ }
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(stackptr2, stackptr);
+ }
p->pop();
}
@@ -274,6 +291,7 @@ namespace gbe
case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src); break;
+ case SEL_OP_BFREV: p->BFREV(dst, src); break;
case SEL_OP_CONVI64_TO_I:
{
p->MOV(dst, src.bottom_half());
@@ -719,7 +737,7 @@ namespace gbe
p->curr.quarterControl = 1;
p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
- p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+ p->MOV(GenRegister::offset(dst, 0, 8 * typeSize(src0.type)), indirect);
} else
NOT_IMPLEMENTED;
p->pop();
@@ -1817,9 +1835,10 @@ namespace gbe
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
- if (barrierType == ir::syncGlobalBarrier) {
- p->FENCE(fenceDst);
+ if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+ p->FENCE(fenceDst, imageFence);
p->MOV(fenceDst, fenceDst);
}
p->push();
@@ -1837,11 +1856,15 @@ namespace gbe
p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT();
p->pop();
+ if (imageFence) {
+ p->FLUSH_SAMPLERCACHE(fenceDst);
+ p->MOV(fenceDst, fenceDst);
+ }
}
void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
- p->FENCE(dst);
+ p->FENCE(dst, false);
p->MOV(dst, dst);
}
@@ -1870,26 +1893,34 @@ namespace gbe
}
void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
- const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister addr = ra->genReg(insn.src(0));
const GenRegister dst = ra->genReg(insn.dst(0));
const uint32_t function = insn.extra.function;
unsigned srcNum = insn.extra.elem;
+ GenRegister data = addr;
+ if (srcNum > 1)
+ data = ra->genReg(insn.src(1));
+
const GenRegister bti = ra->genReg(insn.src(srcNum));
if (bti.file == GEN_IMMEDIATE_VALUE) {
- p->ATOMIC(dst, function, src, bti, srcNum);
+ p->ATOMIC(dst, function, addr, data, bti, srcNum, insn.extra.splitSend);
} else {
GenRegister flagTemp = ra->genReg(insn.dst(1));
GenRegister btiTmp = ra->genReg(insn.dst(2));
- unsigned desc = p->generateAtomicMessageDesc(function, 0, srcNum);
+ unsigned desc = 0;
+ if (insn.extra.splitSend)
+ desc = p->generateAtomicMessageDesc(function, 0, 1);
+ else
+ desc = p->generateAtomicMessageDesc(function, 0, srcNum);
unsigned jip0 = beforeMessage(insn, bti, flagTemp, btiTmp, desc);
p->push();
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
- p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
+ p->ATOMIC(dst, function, addr, data, GenRegister::addr1(0), srcNum, insn.extra.splitSend);
p->pop();
afterMessage(insn, bti, flagTemp, btiTmp, jip0);
}
@@ -1986,10 +2017,10 @@ namespace gbe
else { //size == 8
payload.type = GEN_TYPE_UD;
GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1);
- loadBottomHalf(payload, src);
+ loadBottomHalf(payload, src.isdf()? GenRegister::retype(src, GEN_TYPE_UL) : src );
uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
- loadTopHalf(payload, src);
+ loadTopHalf(payload, src.isdf() ? GenRegister::retype(src, GEN_TYPE_UL) : src);
this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
}
p->pop();
@@ -2016,9 +2047,9 @@ namespace gbe
} else {
uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
- storeBottomHalf(dst, payload);
+ storeBottomHalf(GenRegister::ul8grf(dst.nr, dst.subnr), payload);
this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
- storeTopHalf(dst, payload);
+ storeTopHalf(GenRegister::ul8grf(dst.nr, dst.subnr), payload);
}
p->pop();
}
@@ -2032,8 +2063,8 @@ namespace gbe
if (bti.file == GEN_IMMEDIATE_VALUE) {
p->UNTYPED_READ(dst, src, bti, elemNum);
} else {
- const GenRegister tmp = ra->genReg(insn.dst(elemNum));
- const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
+ const GenRegister tmp = ra->genReg(insn.dst(insn.extra.elem));
+ const GenRegister btiTmp = ra->genReg(insn.dst(insn.extra.elem + 1));
unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -2123,7 +2154,7 @@ namespace gbe
const GenRegister bti = ra->genReg(insn.src(elemNum+1));
if (bti.file == GEN_IMMEDIATE_VALUE) {
- p->UNTYPED_WRITE(src, bti, elemNum*2);
+ p->UNTYPED_WRITE(src, src, bti, elemNum*2, false);
} else {
const GenRegister tmp = ra->genReg(insn.dst(0));
const GenRegister btiTmp = ra->genReg(insn.dst(1));
@@ -2135,22 +2166,27 @@ namespace gbe
p->push();
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
- p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2);
+ p->UNTYPED_WRITE(src, src, GenRegister::addr1(0), elemNum*2, false);
p->pop();
afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister addr = ra->genReg(insn.src(0));
+ GenRegister data = ra->genReg(insn.src(1));
const uint32_t elemNum = insn.extra.elem;
const GenRegister bti = ra->genReg(insn.src(elemNum+1));
if (bti.file == GEN_IMMEDIATE_VALUE) {
- p->UNTYPED_WRITE(src, bti, elemNum);
+ p->UNTYPED_WRITE(addr, data, bti, elemNum, insn.extra.splitSend);
} else {
const GenRegister tmp = ra->genReg(insn.dst(0));
const GenRegister btiTmp = ra->genReg(insn.dst(1));
- unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum);
+ unsigned desc = 0;
+ if (insn.extra.splitSend)
+ desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum);
+ else
+ desc = p->generateUntypedWriteMessageDesc(0, elemNum);
unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -2158,7 +2194,7 @@ namespace gbe
p->push();
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
- p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum);
+ p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum, insn.extra.splitSend);
p->pop();
afterMessage(insn, bti, tmp, btiTmp, jip0);
}
@@ -2190,16 +2226,21 @@ namespace gbe
}
void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
- const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister addr = ra->genReg(insn.src(0));
+ GenRegister data = ra->genReg(insn.src(1));
const uint32_t elemSize = insn.extra.elem;
const GenRegister bti = ra->genReg(insn.src(2));
if (bti.file == GEN_IMMEDIATE_VALUE) {
- p->BYTE_SCATTER(src, bti, elemSize);
+ p->BYTE_SCATTER(addr, data, bti, elemSize, insn.extra.splitSend);
} else {
const GenRegister tmp = ra->genReg(insn.dst(0));
const GenRegister btiTmp = ra->genReg(insn.dst(1));
- unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
+ unsigned desc = 0;
+ if (insn.extra.splitSend)
+ desc = p->generateByteScatterSendsMessageDesc(0, elemSize);
+ else
+ desc = p->generateByteScatterMessageDesc(0, elemSize);
unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -2207,13 +2248,38 @@ namespace gbe
p->push();
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
- p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
+ p->BYTE_SCATTER(addr, data, GenRegister::addr1(0), elemSize, insn.extra.splitSend);
p->pop();
afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
+ void GenContext::emitUntypedReadA64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+
+ void GenContext::emitUntypedWriteA64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+
+ void GenContext::emitByteGatherA64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+
+ void GenContext::emitByteScatterA64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+ void GenContext::emitRead64A64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+ void GenContext::emitWrite64A64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+ void GenContext::emitAtomicA64Instruction(const SelectionInstruction &insn) {
+ assert(0);
+ }
+
void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
for(uint32_t i = 0; i < insn.dstNum; i++) {
@@ -2399,8 +2465,9 @@ namespace gbe
void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ GenRegister data = ra->genReg(insn.src(5));
const uint32_t bti = insn.getbti();
- p->TYPED_WRITE(header, true, bti);
+ p->TYPED_WRITE(header, data, true, bti, insn.extra.typedWriteSplitSend);
}
static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int subFlag, int dim, GenContext *gc)
@@ -2587,6 +2654,7 @@ namespace gbe
uint32_t tsType = insn.extra.timestampType;
GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ (void) tsType;
GBE_ASSERT(tsType == 1);
GenRegister tmArf = GenRegister::tm0();
GenRegister profilingReg[5];
@@ -2712,6 +2780,7 @@ namespace gbe
GenRegister tmp = ra->genReg(insn.dst(0));
uint32_t profilingType = insn.extra.profilingType;
uint32_t bti = insn.extra.profilingBTI;
+ (void) profilingType;
GBE_ASSERT(profilingType == 1);
GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
@@ -2775,7 +2844,7 @@ namespace gbe
p->pop();
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->ATOMIC(incRes, GEN_ATOMIC_OP_INC, sndMsg, GenRegister::immud(bti), 1);
+ p->ATOMIC(incRes, GEN_ATOMIC_OP_INC, sndMsg, sndMsg, GenRegister::immud(bti), 1, false);
} p->pop();
// Calculate the final addr
@@ -2833,14 +2902,14 @@ namespace gbe
// Write it out.
p->curr.execWidth = 8;
p->curr.noMask = 1;
- p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
p->ADD(addr, addr, GenRegister::immud(32));
// time stamps
for (int i = 0; i < 3; i++) {
p->curr.execWidth = 8;
p->MOV(data, GenRegister::retype(profilingReg[i], GEN_TYPE_UD));
- p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
p->ADD(addr, addr, GenRegister::immud(32));
}
} p->pop();
@@ -2878,6 +2947,10 @@ namespace gbe
p->MOV(dataReg, GenRegister::immint64(0x0));
else if (dataReg.type == GEN_TYPE_UL)
p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else if (dataReg.type == GEN_TYPE_W)
+ p->MOV(dataReg, GenRegister::immw(0x0));
+ else if (dataReg.type == GEN_TYPE_UW)
+ p->MOV(dataReg, GenRegister::immuw(0x0));
else
GBE_ASSERT(0); /* unsupported data-type */
}
@@ -2896,6 +2969,10 @@ namespace gbe
p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
else if (dataReg.type == GEN_TYPE_UL)
p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_W)
+ p->MOV(dataReg, GenRegister::immw(0x7FFF));
+ else if (dataReg.type == GEN_TYPE_UW)
+ p->MOV(dataReg, GenRegister::immuw(0xFFFF));
else
GBE_ASSERT(0); /* unsupported data-type */
}
@@ -2914,6 +2991,10 @@ namespace gbe
p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
else if (dataReg.type == GEN_TYPE_UL)
p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else if (dataReg.type == GEN_TYPE_W)
+ p->MOV(dataReg, GenRegister::immw(0x8000));
+ else if (dataReg.type == GEN_TYPE_UW)
+ p->MOV(dataReg, GenRegister::immuw(0x0));
else
GBE_ASSERT(0); /* unsupported data-type */
}
@@ -3173,7 +3254,7 @@ namespace gbe
GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
GenRegister localBarrier = ra->genReg(insn.src(5));
- uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t wg_op = insn.extra.wgop.workgroupOp;
uint32_t simd = p->curr.execWidth;
int32_t jip0, jip1;
@@ -3192,8 +3273,8 @@ namespace gbe
/* use of continuous GRF allocation from insn selection */
GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
- GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
- GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+ GenRegister msgAddr = GenRegister::retype(msg, GEN_TYPE_UD);
+ GenRegister msgData = GenRegister::retype(ra->genReg(insn.dst(3)), dst.type);
/* do some calculation within each thread */
wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
@@ -3228,13 +3309,15 @@ namespace gbe
{
GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
- p->MOV(msgData.offset(msgData, 0), threadDataL);
- p->MOV(msgData.offset(msgData, 1), threadDataH);
-
+ GenRegister msgDataL = GenRegister::retype(msgData, GEN_TYPE_D);
+ GenRegister msgDataH = msgDataL.offset(msgDataL, 1);
p->curr.execWidth = 8;
+ p->MOV(msgDataL, threadDataL);
+ p->MOV(msgDataH, threadDataH);
+
p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
p->ADD(msgAddr, msgAddr, msgSlmOff);
- p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+ p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 2, false);
}
else
{
@@ -3242,7 +3325,7 @@ namespace gbe
p->MOV(msgData, threadData);
p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
p->ADD(msgAddr, msgAddr, msgSlmOff);
- p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+ p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 1, false);
}
/* init partialData register, it will hold the final result */
@@ -3327,30 +3410,38 @@ namespace gbe
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
{
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
/* workaround QW datatype on CMP */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
- dst.offset(dst, 1, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
- dst.offset(dst, 2, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
- dst.offset(dst, 3, 0), partialData);
- }
+ p->push();
+ p->curr.execWidth = 8;
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ if (simd == 16) {
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->SEL_CMP(GEN_CONDITIONAL_LE, GenRegister::Qn(dst, 1),
+ GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+ }
+ p->pop();
+ } else
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
}
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
/* workaround QW datatype on CMP */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
- dst.offset(dst, 1, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
- dst.offset(dst, 2, 0), partialData);
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
- dst.offset(dst, 3, 0), partialData);
- }
+ p->push();
+ p->curr.execWidth = 8;
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ if (simd == 16) {
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ p->SEL_CMP(GEN_CONDITIONAL_GE, GenRegister::Qn(dst, 1),
+ GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+ }
+ p->pop();
+ } else
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
}
}
@@ -3380,7 +3471,7 @@ namespace gbe
const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(0)), dst.type);
GenRegister threadData = ra->genReg(insn.src(1));
- uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t wg_op = insn.extra.wgop.workgroupOp;
uint32_t simd = p->curr.execWidth;
/* masked elements should be properly set to init value */
@@ -3398,27 +3489,25 @@ namespace gbe
}
void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& data,
- GenRegister& src, uint32_t bti) {
+ GenRegister& src, uint32_t bti, bool useSends) {
p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.bottom_half());
- p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(bti), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.top_half(this->simdWidth));
- p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(bti), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
}
void GenContext::emitPrintfInstruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(0));
- const GenRegister tmp0 = ra->genReg(insn.dst(1));
- const GenRegister tmp1 = ra->genReg(insn.dst(2));
+ const GenRegister tmp0 = ra->genReg(insn.dst(0));
+ const GenRegister tmp1 = ra->genReg(insn.dst(1));
GenRegister src;
uint32_t srcNum = insn.srcNum;
- if (insn.extra.continueFlag)
- srcNum--;
GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
GenRegister data = GenRegister::retype(tmp1, GEN_TYPE_UD);
+ bool useSends = insn.extra.printfSplitSend;
if (!insn.extra.continueFlag) {
p->push(); {
@@ -3429,18 +3518,18 @@ namespace gbe
p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
} p->pop();
- p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, GenRegister::immud(insn.extra.printfBTI), 2);
+ p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, data, GenRegister::immud(insn.extra.printfBTI), 2, useSends);
/* Write out the header. */
p->MOV(data, GenRegister::immud(0xAABBCCDD));
- p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
- p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
p->MOV(data, GenRegister::immud(insn.extra.printfNum));
- p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
}
@@ -3450,25 +3539,16 @@ namespace gbe
src = ra->genReg(insn.src(i));
if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D || src.type == GEN_TYPE_F) {
p->MOV(GenRegister::retype(data, src.type), src);
- p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
} else if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_UB ) {
p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src);
- p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
} else if (src.type == GEN_TYPE_L || src.type == GEN_TYPE_UL ) {
- emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI);
+ emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI, useSends);
}
}
-
- if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
- p->push();
- p->curr.execWidth = 1;
- }
- p->MOV(dst, GenRegister::immd(0));
- if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
- p->pop();
- }
}
void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
@@ -3489,323 +3569,41 @@ namespace gbe
}
void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD);
- const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
- const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), GEN_TYPE_UD);
- const uint32_t simdWidth = p->curr.execWidth;
-
- // Make header
- p->push();
- {
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0, 0));
-
- // Update the header with the current address
- p->curr.execWidth = 1;
- p->MOV(headeraddr, addr);
-
- // Put zero in the general state base address
- p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
- }
- p->pop();
- // Now read the data, oword block read can only work with simd16 and no mask
- if (vec_size == 1) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
- }
- p->pop();
- } else if (vec_size == 2) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
- }
- p->pop();
- p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
- p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8));
- } else if (vec_size == 4 || vec_size == 8) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < vec_size / 4; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- p->OBREAD(tmp, header, insn.getbti(), 8);
- }
- p->pop();
- for (uint32_t j = 0; j < 4; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, j));
- }
- } else {
- for (uint32_t i = 0; i < vec_size / 2; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- p->OBREAD(tmp, header, insn.getbti(), 8);
- for (uint32_t j = 0; j < 2; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, j*2));
- }
- }
- } else NOT_SUPPORTED;
+ const GenRegister header = ra->genReg(insn.src(0));
+ const GenRegister tmp = ra->genReg(insn.dst(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t ow_size = insn.extra.elem;
+ bool isA64 = bti == 255;
+ if (isA64)
+ p->OBREADA64(tmp, header, bti, ow_size);
+ else
+ p->OBREAD(tmp, header, bti, ow_size);
}
void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
- const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::offset(header, 1);
- const uint32_t simdWidth = p->curr.execWidth;
- uint32_t tmp_size = simdWidth * vec_size / 8;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
-
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the current address
- p->curr.execWidth = 1;
- p->SHR(headeraddr, addr, GenRegister::immud(4));
-
- // Put zero in the general state base address
- p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
- p->pop();
- // Now write the data, oword block write can only work with simd16 and no mask
- if (vec_size == 1) {
- p->MOV(tmp, ra->genReg(insn.src(1)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), simdWidth / 4);
- }
- p->pop();
- } else if (vec_size == 2) {
- p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
- p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ;
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), simdWidth / 2);
- }
- p->pop();
- } else if (vec_size == 4 || vec_size == 8) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < vec_size / 4; i++) {
- for (uint32_t j = 0; j < 4; j++)
- p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ;
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
- }
- p->pop();
- }
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), 8);
- }
- p->pop();
- }
- } else {
- for (uint32_t i = 0; i < vec_size / 2; i++) {
- for (uint32_t j = 0; j < 2; j++)
- p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ;
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
- }
- p->pop();
- }
- p->OBWRITE(header, insn.getbti(), 8);
- }
- }
- } else NOT_SUPPORTED;
-
+ const GenRegister header = ra->genReg(insn.src(0));
+ const GenRegister data = ra->genReg(insn.src(1));
+ const uint32_t bti = insn.getbti();
+ const uint32_t ow_size = insn.extra.elem;
+ bool isA64 = bti == 255;
+ if (isA64)
+ p->OBWRITEA64(header, bti, ow_size);
+ else
+ p->OBWRITE(header, data, bti, ow_size, insn.extra.splitSend);
}
void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(1));
- const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
- const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
- const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
- const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
- size_t vec_size = insn.extra.elem;
- uint32_t blocksize = 0x1F | (vec_size-1) << 16;
-
- if (simdWidth == 8)
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(dst, header, insn.getbti(), vec_size);
- p->pop();
-
- }
- else if (simdWidth == 16)
- {
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // First half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(ra->genReg(insn.dst(i + 1)), GenRegister::offset(tmp, i));
-
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
-
- // Move the reg to fit vector rule.
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
- GenRegister::offset(tmp, i));
- p->pop();
- } else NOT_IMPLEMENTED;
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister header = ra->genReg(insn.src(0));
+ const size_t response_size = insn.extra.elem;
+ p->MBREAD(dst, header, insn.getbti(), response_size);
}
void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
- const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- GenRegister offsetx, offsety, blocksizereg;
- size_t vec_size = insn.extra.elem;
- uint32_t blocksize = 0x1F | (vec_size-1) << 16;
-
- offsetx = GenRegister::offset(header, 0, 0*4);
- offsety = GenRegister::offset(header, 0, 1*4);
- blocksizereg = GenRegister::offset(header, 0, 2*4);
-
- if (simdWidth == 8)
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
- // Now read the data
- p->MBWRITE(header, insn.getbti(), vec_size);
- p->pop();
-
- }
- else
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // First half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
- p->MBWRITE(header, insn.getbti(), vec_size);
-
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
- // Now write the data
- p->MBWRITE(header, insn.getbti(), vec_size);
-
- p->pop();
- }
+ const GenRegister header = ra->genReg(insn.dst(0));
+ const GenRegister data = ra->genReg(insn.dst(1));
+ const size_t data_size = insn.extra.elem;
+ p->MBWRITE(header, data, insn.getbti(), data_size, insn.extra.splitSend);
}
BVAR(OCL_OUTPUT_REG_ALLOC, false);
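The rewritten emit helpers above no longer build message headers themselves; instruction selection now prepares the header and data payloads, and the emitters only pick the message variant. A minimal stand-alone sketch of the dispatch rule (the bti value 255 comes from the code above; the helper name and main() harness are ours):

    // Sketch (assumption made explicit): bti 255 is the reserved
    // "stateless" binding table index that routes block accesses to the
    // A64 message variants.
    #include <cassert>
    #include <cstdint>

    static bool usesA64BlockMessage(uint32_t bti) { return bti == 255; }

    int main() {
      assert(usesA64BlockMessage(255));  // stateless -> OBREADA64/OBWRITEA64
      assert(!usesA64BlockMessage(0));   // bound surface -> OBREAD/OBWRITE
      return 0;
    }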
@@ -3841,9 +3639,11 @@ namespace gbe
sel->select();
if (OCL_OPTIMIZE_SEL_IR)
sel->optimize();
+ sel->addID();
if (OCL_OUTPUT_SEL_IR)
- outputSelectionIR(*this, this->sel);
+ outputSelectionIR(*this, this->sel, genKernel->getName());
schedulePreRegAllocation(*this, *this->sel);
+ sel->addID();
if (UNLIKELY(ra->allocate(*this->sel) == false))
return false;
schedulePostRegAllocation(*this, *this->sel);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index fb3d4fe..7fd40d1 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -159,8 +159,15 @@ namespace gbe
void emitMathInstruction(const SelectionInstruction &insn);
virtual void emitRead64Instruction(const SelectionInstruction &insn);
virtual void emitWrite64Instruction(const SelectionInstruction &insn);
+ virtual void emitRead64A64Instruction(const SelectionInstruction &insn);
+ virtual void emitWrite64A64Instruction(const SelectionInstruction &insn);
+ virtual void emitAtomicA64Instruction(const SelectionInstruction &insn);
void emitUntypedReadInstruction(const SelectionInstruction &insn);
void emitUntypedWriteInstruction(const SelectionInstruction &insn);
+ virtual void emitUntypedReadA64Instruction(const SelectionInstruction &insn);
+ virtual void emitUntypedWriteA64Instruction(const SelectionInstruction &insn);
+ virtual void emitByteGatherA64Instruction(const SelectionInstruction &insn);
+ virtual void emitByteScatterA64Instruction(const SelectionInstruction &insn);
void emitAtomicInstruction(const SelectionInstruction &insn);
void emitByteGatherInstruction(const SelectionInstruction &insn);
void emitByteScatterInstruction(const SelectionInstruction &insn);
@@ -236,7 +243,7 @@ namespace gbe
void calcGlobalXYZRange(GenRegister& reg, GenRegister& tmp, int flag, int subFlag);
virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
- virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, GenRegister& src, uint32_t bti);
+ virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, GenRegister& src, uint32_t bti, bool useSends);
private:
CompileErrorCode errCode;
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index bcbb23f..c34e1bb 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -54,6 +54,7 @@
#include <stdint.h>
#include "backend/gen7_instruction.hpp"
#include "backend/gen8_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
/////////////////////////////////////////////////////////////////////////////
// Gen EU defines
@@ -129,6 +130,7 @@ enum opcode {
GEN_OPCODE_CMPN = 17,
GEN_OPCODE_F32TO16 = 19,
GEN_OPCODE_F16TO32 = 20,
+ GEN_OPCODE_BFREV = 23,
GEN_OPCODE_JMPI = 32,
GEN_OPCODE_BRD = 33,
GEN_OPCODE_IF = 34,
@@ -147,6 +149,7 @@ enum opcode {
GEN_OPCODE_WAIT = 48,
GEN_OPCODE_SEND = 49,
GEN_OPCODE_SENDC = 50,
+ GEN_OPCODE_SENDS = 51,
GEN_OPCODE_MATH = 56,
GEN_OPCODE_ADD = 64,
GEN_OPCODE_MUL = 65,
@@ -357,6 +360,14 @@ enum GenMessageTarget {
#define GEN75_P1_ATOMIC_COUNTER_4X2 12 //1100: Atomic Counter Operation 4X2
#define GEN75_P1_TYPED_SURFACE_WRITE 13 //1101: Typed Surface Write
+#define GEN8_P1_BLOCK_READ_A64 20 //10100
+#define GEN8_P1_BLOCK_WRITE_A64 21 //10101
+#define GEN8_P1_BYTE_GATHER_A64 16 //10000
+#define GEN8_P1_UNTYPED_READ_A64 17 //10001
+#define GEN8_P1_UNTYPED_ATOMIC_A64 18 //10010
+#define GEN8_P1_UNTYPED_WRITE_A64 25 //11001
+#define GEN8_P1_BYTE_SCATTER_A64 26 //11010
+
/* Data port data cache scratch messages*/
#define GEN_SCRATCH_READ 0
#define GEN_SCRATCH_WRITE 1
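The new A64 data-port message types can be sanity-checked against their binary comments at compile time; a stand-alone sketch, with the values copied from this hunk (not part of the patch):

    static_assert(20 == 0b10100, "GEN8_P1_BLOCK_READ_A64");
    static_assert(21 == 0b10101, "GEN8_P1_BLOCK_WRITE_A64");
    static_assert(16 == 0b10000, "GEN8_P1_BYTE_GATHER_A64");
    static_assert(17 == 0b10001, "GEN8_P1_UNTYPED_READ_A64");
    static_assert(18 == 0b10010, "GEN8_P1_UNTYPED_ATOMIC_A64");
    static_assert(25 == 0b11001, "GEN8_P1_UNTYPED_WRITE_A64");
    static_assert(26 == 0b11010, "GEN8_P1_BYTE_SCATTER_A64");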
@@ -417,6 +428,7 @@ enum GenMessageTarget {
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7
#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10
+#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH 0x1f
/* for GEN5 only */
#define GEN_SAMPLER_SIMD_MODE_SIMD4X2 0
@@ -549,6 +561,7 @@ union GenNativeInstruction
};
union Gen7NativeInstruction gen7_insn;
union Gen8NativeInstruction gen8_insn;
+ union Gen9NativeInstruction gen9_insn;
//Gen7 & Gen8 common field
struct {
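Two opcode additions in this file are worth calling out: BFREV (bit reverse) and SENDS, the split-payload send used by the new useSends paths below. A tiny sketch restating the numbering (values copied from the hunks; the shortened names are ours):

    enum GenOpcode : unsigned { BFREV = 23, SEND = 49, SENDC = 50, SENDS = 51 };
    static_assert(SENDS == SEND + 2, "split send sits right after sendc");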
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 975e1c7..03ce0e2 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,6 @@ namespace gbe
unsigned msg_length, unsigned response_length,
bool header_present, bool end_of_thread)
{
- setSrc1(inst, GenRegister::immd(0));
inst->bits3.generic_gen5.header_present = header_present;
inst->bits3.generic_gen5.response_length = response_length;
inst->bits3.generic_gen5.msg_length = msg_length;
@@ -238,8 +237,7 @@ namespace gbe
NOT_SUPPORTED;
}
- static void setDPByteScatterGather(GenEncoder *p,
- GenNativeInstruction *insn,
+ void GenEncoder::setDPByteScatterGather(GenNativeInstruction *insn,
uint32_t bti,
uint32_t elem_size,
uint32_t msg_type,
@@ -247,44 +245,59 @@ namespace gbe
uint32_t response_length)
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
- p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
insn->bits3.gen7_byte_rw.msg_type = msg_type;
insn->bits3.gen7_byte_rw.bti = bti;
insn->bits3.gen7_byte_rw.data_size = elem_size;
- if (p->curr.execWidth == 8)
+ if (curr.execWidth == 8)
insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
- else if (p->curr.execWidth == 16)
+ else if (curr.execWidth == 16)
insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
else
NOT_SUPPORTED;
}
- static void setOBlockRW(GenEncoder *p,
- GenNativeInstruction *insn,
- uint32_t bti,
- uint32_t size,
- uint32_t msg_type,
- uint32_t msg_length,
- uint32_t response_length)
+ void GenEncoder::setOBlockRW(GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t block_size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
- p->setMessageDescriptor(insn, sfid, msg_length, response_length);
- assert(size == 2 || size == 4 || size == 8);
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
- insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 4);
+ insn->bits3.gen7_oblock_rw.block_size = block_size;
insn->bits3.gen7_oblock_rw.header_present = 1;
}
- static void setMBlockRW(GenEncoder *p,
- GenNativeInstruction *insn,
- uint32_t bti,
- uint32_t msg_type,
- uint32_t msg_length,
- uint32_t response_length)
+ uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half)
+ {
+ /* 000: 1 OWord, read into or written from the low 128 bits of the destination register.
+ * 001: 1 OWord, read into or written from the high 128 bits of the destination register.
+ * 010: 2 OWords
+ * 011: 4 OWords
+ * 100: 8 OWords */
+ switch(oword_size)
+ {
+ case 1: return low_half ? 0 : 1;
+ case 2: return 2;
+ case 4: return 3;
+ case 8: return 4;
+ default: NOT_SUPPORTED;
+ }
+ return 0;
+ }
+
+ void GenEncoder::setMBlockRW(GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
- p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
insn->bits3.gen7_mblock_rw.msg_type = msg_type;
insn->bits3.gen7_mblock_rw.bti = bti;
insn->bits3.gen7_mblock_rw.header_present = 1;
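getOBlockSize() folds the OWord-count-to-block-size table into one place, including the low/high 128-bit variants for single-OWord accesses. A stand-alone restatement with a quick self-test (same mapping as the function above):

    #include <cassert>
    #include <cstdint>

    static uint32_t oblockSize(uint32_t owords, bool lowHalf = true) {
      switch (owords) {
        case 1: return lowHalf ? 0 : 1; // 1 OWord, low/high 128 bits
        case 2: return 2;               // 2 OWords
        case 4: return 3;               // 4 OWords
        case 8: return 4;               // 8 OWords
        default: assert(!"unsupported OWord count"); return 0;
      }
    }

    int main() {
      assert(oblockSize(1) == 0 && oblockSize(1, false) == 1);
      assert(oblockSize(2) == 2 && oblockSize(4) == 3 && oblockSize(8) == 4);
      return 0;
    }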
@@ -411,7 +424,31 @@ namespace gbe
return insn->bits3.ud;
}
- void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+ unsigned GenEncoder::generateUntypedWriteSendsMessageDesc(unsigned bti, unsigned elemNum) {
+ GenNativeInstruction insn;
+ memset(&insn, 0, sizeof(GenNativeInstruction));
+ return setUntypedWriteSendsMessageDesc(&insn, bti, elemNum);
+ }
+
+ unsigned GenEncoder::setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum)
+ {
+ assert(0);
+ return 0;
+ }
+
+ void GenEncoder::UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum) {
+ assert(0);
+ }
+
+ void GenEncoder::UNTYPED_WRITEA64(GenRegister src, uint32_t elemNum){
+ assert(0);
+ }
+
+ void GenEncoder::ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+ assert(0);
+ }
+
+ void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 && elemNum <= 4);
this->setHeader(insn);
@@ -449,8 +486,7 @@ namespace gbe
response_length = 2;
} else
NOT_IMPLEMENTED;
- setDPByteScatterGather(this,
- insn,
+ setDPByteScatterGather(insn,
bti,
elemSize,
GEN7_BYTE_GATHER,
@@ -482,6 +518,18 @@ namespace gbe
return setByteScatterMessageDesc(&insn, bti, elemSize);
}
+ unsigned GenEncoder::generateByteScatterSendsMessageDesc(unsigned bti, unsigned elemSize) {
+ GenNativeInstruction insn;
+ memset(&insn, 0, sizeof(GenNativeInstruction));
+ return setByteScatterSendsMessageDesc(&insn, bti, elemSize);
+ }
+
+ unsigned GenEncoder::setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize)
+ {
+ assert(0);
+ return 0;
+ }
+
unsigned GenEncoder::setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
@@ -492,8 +540,7 @@ namespace gbe
} else
NOT_IMPLEMENTED;
- setDPByteScatterGather(this,
- insn,
+ setDPByteScatterGather(insn,
bti,
elemSize,
GEN7_BYTE_SCATTER,
@@ -502,7 +549,7 @@ namespace gbe
return insn->bits3.ud;
}
- void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister bti, uint32_t elemSize) {
+ void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
@@ -524,6 +571,13 @@ namespace gbe
this->setSrc1(insn, bti);
}
}
+ void GenEncoder::BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize) {
+ assert(0);
+ }
+
+ void GenEncoder::BYTE_SCATTERA64(GenRegister src, uint32_t elemSize){
+ assert(0);
+ }
void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
@@ -589,15 +643,19 @@ namespace gbe
NOT_SUPPORTED;
return insn->bits3.ud;
}
+ unsigned GenEncoder::setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long) {
+ GBE_ASSERT(0);
+ return 0;
+ }
- void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+ void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(addr.nr, 0));
if (bti.file == GEN_IMMEDIATE_VALUE) {
this->setSrc1(insn, GenRegister::immud(0));
setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
@@ -795,6 +853,7 @@ namespace gbe
ALU2(MACH)
ALU3(MAD)
ALU3(LRP)
+ ALU1(BFREV)
// ALU2(BRC)
// ALU1(ENDIF)
// ALU1(IF)
@@ -873,27 +932,30 @@ namespace gbe
this->setHeader(insn);
this->setDst(insn, GenRegister::null());
this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::immud(0));
setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
insn->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
insn->bits3.msg_gateway.notify = 0x1;
}
void GenEncoder::FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- this->setHeader(insn);
- this->setDst(insn, GenRegister::null());
- this->setSrc0(insn, src);
- setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
- insn->bits3.msg_gateway.sub_function_id = GEN_FORWARD_MSG;
- GBE_ASSERT(notifyN <= 2);
- insn->bits3.msg_gateway.notify = notifyN;
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::null());
+ this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::immud(0));
+ setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
+ insn->bits3.msg_gateway.sub_function_id = GEN_FORWARD_MSG;
+ GBE_ASSERT(notifyN <= 2);
+ insn->bits3.msg_gateway.notify = notifyN;
}
- void GenEncoder::FENCE(GenRegister dst) {
+ void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
this->setDst(insn, dst);
this->setSrc0(insn, dst);
+ this->setSrc1(insn, GenRegister::immud(0));
setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
insn->bits3.gen7_memory_fence.commit_enable = 0x1;
@@ -1154,11 +1216,16 @@ namespace gbe
this->setHeader(insn);
this->setDst(insn, dest);
this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
simd_mode, return_format);
}
+ void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+ // only Gen8+ supports flushing the sampler cache
+ assert(0);
+ }
void GenEncoder::setVmeMessage(GenNativeInstruction *insn,
unsigned char bti,
@@ -1200,19 +1267,21 @@ namespace gbe
this->setHeader(insn);
this->setDst(insn, dest);
this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
setVmeMessage(insn, bti, response_length, msg_length,
msg_type, vme_search_path_lut, lut_sub);
}
- void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
+ void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool header_present, unsigned char bti, bool useSends)
{
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- uint32_t msg_type = GEN_TYPED_WRITE;
- uint32_t msg_length = header_present ? 9 : 8;
- this->setHeader(insn);
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
- this->setSrc0(insn, msg);
- setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_type = GEN_TYPED_WRITE;
+ uint32_t msg_length = header_present ? 9 : 8;
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
+ setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
}
static void setScratchMessage(GenEncoder *p,
GenNativeInstruction *insn,
@@ -1258,72 +1327,86 @@ namespace gbe
setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
}
- void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- const uint32_t response_length = size / 2; // Size is in regs
+ uint32_t sizeinreg = ow_size / 2;
+ // a 1-OWord (half register) access still occupies a full register
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+ const uint32_t response_length = sizeinreg; // Size is in reg
+
this->setHeader(insn);
this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
- setOBlockRW(this,
- insn,
+ setOBlockRW(insn,
bti,
- size,
+ block_size,
GEN7_UNALIGNED_OBLOCK_READ,
msg_length,
response_length);
}
- void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- const uint32_t msg_length = 1 + size / 2; // Size is in owords
+ uint32_t sizeinreg = ow_size / 2;
+ // a 1-OWord (half register) access still occupies a full register
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t msg_length = 1 + sizeinreg; // payload registers plus one header register
const uint32_t response_length = 0;
+ const uint32_t block_size = getOBlockSize(ow_size);
+
this->setHeader(insn);
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
- setOBlockRW(this,
- insn,
+ setOBlockRW(insn,
bti,
- size,
+ block_size,
GEN7_OBLOCK_WRITE,
msg_length,
response_length);
}
- void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- const uint32_t response_length = size; // Size of registers
+ const uint32_t response_length = response_size; // Size of registers
this->setHeader(insn);
this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
- setMBlockRW(this,
- insn,
+ setMBlockRW(insn,
bti,
GEN75_P1_MEDIA_BREAD,
msg_length,
response_length);
}
- void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- const uint32_t msg_length = 1 + size;
+ const uint32_t msg_length = 1 + data_size;
const uint32_t response_length = 0; // Size of registers
this->setHeader(insn);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
- setMBlockRW(this,
- insn,
+ setMBlockRW(insn,
bti,
GEN75_P1_MEDIA_TYPED_BWRITE,
msg_length,
response_length);
}
+ void GenEncoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize) {
+ NOT_SUPPORTED;
+ }
+
+ void GenEncoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize) {
+ NOT_SUPPORTED;
+ }
+
void GenEncoder::EOT(uint32_t msg) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
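OBREAD/OBWRITE now take the access size in OWords and convert to registers themselves: one GRF holds two OWords, and a 1-OWord access still costs a whole register, with OBWRITE adding one register for the header. A minimal sketch of that arithmetic:

    #include <cassert>
    #include <cstdint>

    // The register-count arithmetic used by OBREAD/OBWRITE above.
    static uint32_t owordsToRegs(uint32_t ow_size) {
      uint32_t regs = ow_size / 2;
      return regs == 0 ? 1 : regs;
    }

    int main() {
      assert(owordsToRegs(1) == 1 && owordsToRegs(2) == 1);
      assert(owordsToRegs(4) == 2 && owordsToRegs(8) == 4);
      assert(1 + owordsToRegs(8) == 5); // OBWRITE: header + payload registers
      return 0;
    }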
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 0f835ca..3e45c81 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -131,6 +131,7 @@ namespace gbe
ALU3(LRP)
ALU2(BRC)
ALU1(BRD)
+ ALU1(BFREV)
#undef ALU1
#undef ALU2
#undef ALU2_MOD
@@ -144,7 +145,7 @@ namespace gbe
/*! Forward the gateway message. */
void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0);
/*! Memory fence message (to order loads and stores between threads) */
- void FENCE(GenRegister dst);
+ virtual void FENCE(GenRegister dst, bool flushRWCache);
/*! Jump indexed instruction */
virtual void JMPI(GenRegister src, bool longjmp = false);
/*! IF indexed instruction */
@@ -170,15 +171,25 @@ namespace gbe
/*! Wait instruction (used for the barrier) */
void WAIT(uint32_t n = 0);
/*! Atomic instructions */
- virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+ /*! AtomicA64 instructions */
+ virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
/*! Untyped read (upto 4 channels) */
virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */
- virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
+ /*! Untyped read A64 (up to 4 channels) */
+ virtual void UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum);
+ /*! Untyped write A64 (up to 4 channels) */
+ virtual void UNTYPED_WRITEA64(GenRegister src, uint32_t elemNum);
/*! Byte gather (for unaligned bytes, shorts and ints) */
void BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
- void BYTE_SCATTER(GenRegister src, GenRegister bti, uint32_t elemSize);
+ virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends);
+ /*! Byte gather a64 (for unaligned bytes, shorts and ints) */
+ virtual void BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize);
+ /*! Byte scatter a64 (for unaligned bytes, shorts and ints) */
+ virtual void BYTE_SCATTERA64(GenRegister src, uint32_t elemSize);
/*! DWord gather (for constant cache read) */
void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
/*! for scratch memory read */
@@ -219,11 +230,14 @@ namespace gbe
uint32_t msg_type,
unsigned char vme_search_path_lut,
unsigned char lut_sub);
+ virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
/*! TypedWrite instruction for texture */
virtual void TYPED_WRITE(GenRegister header,
+ GenRegister data,
bool header_present,
- unsigned char bti);
+ unsigned char bti,
+ bool useSends);
/*! Extended math function (2 sources) */
void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
/*! Extended math function (1 source) */
@@ -235,6 +249,8 @@ namespace gbe
////////////////////////////////////////////////////////////////////////
// Helper functions to encode
////////////////////////////////////////////////////////////////////////
+ void setDPByteScatterGather(GenNativeInstruction *insn, uint32_t bti, uint32_t elem_size,
+ uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
uint32_t msg_type, uint32_t msg_length,
uint32_t response_length);
@@ -245,16 +261,21 @@ namespace gbe
unsigned msg_length, unsigned response_length,
bool header_present = false, bool end_of_thread = false);
virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+ virtual unsigned setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long);
virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
unsigned setByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
unsigned setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+ virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
unsigned generateAtomicMessageDesc(unsigned function, unsigned bti, unsigned srcNum);
unsigned generateUntypedReadMessageDesc(unsigned bti, unsigned elemNum);
unsigned generateUntypedWriteMessageDesc(unsigned bti, unsigned elemNum);
+ unsigned generateUntypedWriteSendsMessageDesc(unsigned bti, unsigned elemNum);
unsigned generateByteGatherMessageDesc(unsigned bti, unsigned elemSize);
unsigned generateByteScatterMessageDesc(unsigned bti, unsigned elemSize);
+ unsigned generateByteScatterSendsMessageDesc(unsigned bti, unsigned elemSize);
virtual void setHeader(GenNativeInstruction *insn) = 0;
virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
@@ -267,14 +288,24 @@ namespace gbe
virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
GenRegister src1 = GenRegister::null());
virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+
+ /*! OBlock helper function */
+ uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true);
+ void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+ void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t block_size, uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+
/*! OBlock read */
- void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
/*! OBlock write */
- void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends);
/*! MBlock read */
- virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size);
/*! MBlock write */
- virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends);
+ /*! A64 OBlock read */
+ virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
+ /*! A64 OBlock write */
+ virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size);
GBE_CLASS(GenEncoder); //!< Use custom allocators
virtual void alu3(uint32_t opcode, GenRegister dst,
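The recurring pattern in these signatures is the useSends flag: address and data become separate operands so that, on hardware with split-payload sends, they no longer need to be allocated back to back. A sketch of the payload-shape decision (a model of the calling convention only, not the encoder; the names are ours):

    #include <cassert>
    #include <cstdint>

    // With sends, src0 carries the address registers and src1 the data
    // registers; without it, one contiguous block carries both.
    struct PayloadShape { uint32_t src0Regs, src1Regs; };

    static PayloadShape untypedWriteShape(uint32_t addrRegs, uint32_t dataRegs,
                                          bool useSends) {
      if (useSends)
        return { addrRegs, dataRegs };   // SENDS: two independent payloads
      return { addrRegs + dataRegs, 0 }; // SEND: one contiguous payload
    }

    int main() {
      PayloadShape s = untypedWriteShape(1, 4, true);
      assert(s.src0Regs == 1 && s.src1Regs == 4);
      s = untypedWriteShape(1, 4, false);
      assert(s.src0Regs == 5 && s.src1Regs == 0);
      return 0;
    }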
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index c396626..c75557c 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -26,8 +26,14 @@ DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1)
DECL_GEN7_SCHEDULE(Fence, 80, 1, 1)
DECL_GEN7_SCHEDULE(Read64, 80, 1, 1)
DECL_GEN7_SCHEDULE(Write64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Read64A64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Write64A64, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedRead, 160, 1, 1)
DECL_GEN7_SCHEDULE(UntypedWrite, 160, 1, 1)
+DECL_GEN7_SCHEDULE(UntypedReadA64, 160, 1, 1)
+DECL_GEN7_SCHEDULE(UntypedWriteA64, 160, 1, 1)
+DECL_GEN7_SCHEDULE(ByteGatherA64, 160, 1, 1)
+DECL_GEN7_SCHEDULE(ByteScatterA64, 160, 1, 1)
DECL_GEN7_SCHEDULE(ByteGather, 160, 1, 1)
DECL_GEN7_SCHEDULE(ByteScatter, 160, 1, 1)
DECL_GEN7_SCHEDULE(DWordGather, 160, 1, 1)
@@ -41,6 +47,7 @@ DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(SpillReg, 20, 1, 1)
DECL_GEN7_SCHEDULE(UnSpillReg, 160, 1, 1)
DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
+DECL_GEN7_SCHEDULE(AtomicA64, 80, 1, 1)
DECL_GEN7_SCHEDULE(I64MUL, 20, 40, 20)
DECL_GEN7_SCHEDULE(I64SATADD, 20, 40, 20)
DECL_GEN7_SCHEDULE(I64SATSUB, 20, 40, 20)
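The new A64 rows reuse the costs of their bound-surface counterparts (80 for 64-bit and atomic messages, 160 for untyped/byte messages). A stand-alone restatement, assuming the first macro argument is a latency estimate as the existing Read64/UntypedRead rows suggest:

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, int> latency = {
        {"Read64A64", 80},       {"Write64A64", 80},      {"AtomicA64", 80},
        {"UntypedReadA64", 160}, {"UntypedWriteA64", 160},
        {"ByteGatherA64", 160},  {"ByteScatterA64", 160},
      };
      // Each A64 variant inherits the cost of its bound-surface counterpart.
      assert(latency["Read64A64"] == 80 && latency["UntypedReadA64"] == 160);
      return 0;
    }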
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2b89c7f..22b0ddc 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -144,6 +144,7 @@ namespace gbe
case GEN_TYPE_UL: return TYPE_U64;
case GEN_TYPE_F: return TYPE_FLOAT;
case GEN_TYPE_DF: return TYPE_DOUBLE;
+ case GEN_TYPE_HF : return TYPE_HALF;
default: NOT_SUPPORTED; return TYPE_FLOAT;
}
}
@@ -168,7 +169,7 @@ namespace gbe
SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
parent(NULL), opcode(op), dstNum(dst), srcNum(src)
{
- extra.function = 0;
+ extra = { 0 };
}
void SelectionInstruction::prepend(SelectionInstruction &other) {
@@ -183,9 +184,13 @@ namespace gbe
bool SelectionInstruction::isRead(void) const {
return this->opcode == SEL_OP_UNTYPED_READ ||
+ this->opcode == SEL_OP_UNTYPED_READA64 ||
this->opcode == SEL_OP_READ64 ||
+ this->opcode == SEL_OP_READ64A64 ||
this->opcode == SEL_OP_ATOMIC ||
+ this->opcode == SEL_OP_ATOMICA64 ||
this->opcode == SEL_OP_BYTE_GATHER ||
+ this->opcode == SEL_OP_BYTE_GATHERA64 ||
this->opcode == SEL_OP_SAMPLE ||
this->opcode == SEL_OP_VME ||
this->opcode == SEL_OP_DWORD_GATHER ||
@@ -209,9 +214,13 @@ namespace gbe
bool SelectionInstruction::isWrite(void) const {
return this->opcode == SEL_OP_UNTYPED_WRITE ||
+ this->opcode == SEL_OP_UNTYPED_WRITEA64 ||
this->opcode == SEL_OP_WRITE64 ||
+ this->opcode == SEL_OP_WRITE64A64 ||
this->opcode == SEL_OP_ATOMIC ||
+ this->opcode == SEL_OP_ATOMICA64 ||
this->opcode == SEL_OP_BYTE_SCATTER ||
+ this->opcode == SEL_OP_BYTE_SCATTERA64 ||
this->opcode == SEL_OP_TYPED_WRITE ||
this->opcode == SEL_OP_OBWRITE ||
this->opcode == SEL_OP_MBWRITE;
@@ -225,6 +234,50 @@ namespace gbe
return this->opcode == SEL_OP_LABEL;
}
+ bool SelectionInstruction::sameAsDstRegion(uint32_t srcID) {
+ assert(srcID < srcNum);
+ if (dstNum == 0)
+ return true;
+ GenRegister &srcReg = this->src(srcID);
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const GenRegister &dstReg = this->dst(dstID);
+ if (!dstReg.isSameRegion(srcReg))
+ return false;
+ }
+ return true;
+ }
+
+ bool SelectionInstruction::isNative(void) const {
+ return this->opcode == SEL_OP_NOT || /* ALU1 */
+ this->opcode == SEL_OP_LZD ||
+ this->opcode == SEL_OP_RNDZ ||
+ this->opcode == SEL_OP_RNDE ||
+ this->opcode == SEL_OP_RNDD ||
+ this->opcode == SEL_OP_RNDU ||
+ this->opcode == SEL_OP_FRC ||
+ this->opcode == SEL_OP_F16TO32 ||
+ this->opcode == SEL_OP_F32TO16 ||
+ this->opcode == SEL_OP_CBIT ||
+ this->opcode == SEL_OP_SEL || /* ALU2 */
+ this->opcode == SEL_OP_AND ||
+ this->opcode == SEL_OP_OR ||
+ this->opcode == SEL_OP_XOR ||
+ this->opcode == SEL_OP_SHR ||
+ this->opcode == SEL_OP_SHL ||
+ this->opcode == SEL_OP_RSR ||
+ this->opcode == SEL_OP_RSL ||
+ this->opcode == SEL_OP_ASR ||
+ this->opcode == SEL_OP_SEL ||
+ this->opcode == SEL_OP_ADD ||
+ this->opcode == SEL_OP_MUL ||
+ this->opcode == SEL_OP_FBH ||
+ this->opcode == SEL_OP_FBL ||
+ this->opcode == SEL_OP_MACH ||
+ this->opcode == SEL_OP_MATH ||
+ this->opcode == SEL_OP_LRP || /* ALU3 */
+ this->opcode == SEL_OP_MAD;
+ }
+
///////////////////////////////////////////////////////////////////////////
// SelectionVector
///////////////////////////////////////////////////////////////////////////
@@ -237,7 +290,7 @@ namespace gbe
// SelectionBlock
///////////////////////////////////////////////////////////////////////////
- SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0), removeSimpleIfEndif(false){}
+ SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), endifLabel( (ir::LabelIndex) 0), removeSimpleIfEndif(false){}
void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }
@@ -364,7 +417,9 @@ namespace gbe
/*! spill a register (insert spill/unspill instructions) */
INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
bool has32X32Mul() const { return bHas32X32Mul; }
+ bool hasSends() const { return bHasSends; }
void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
+ void setHasSends(bool b) { bHasSends = b; }
bool hasLongType() const { return bHasLongType; }
bool hasDoubleType() const { return bHasDoubleType; }
bool hasHalfType() const { return bHasHalfType; }
@@ -570,6 +625,7 @@ namespace gbe
I64Shift(I64SHL)
I64Shift(I64SHR)
I64Shift(I64ASR)
+ ALU1(BFREV)
#undef ALU1
#undef ALU1WithTemp
#undef ALU2
@@ -632,10 +688,16 @@ namespace gbe
void WAIT(uint32_t n = 0);
/*! Atomic instruction */
void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, vector<GenRegister> temps);
+ /*! AtomicA64 instruction */
+ void ATOMICA64(Reg dst, uint32_t function, uint32_t srcNum, vector<GenRegister> src, GenRegister bti, vector<GenRegister> temps);
/*! Read 64 bits float/int array */
void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, vector<GenRegister> temps);
/*! Write 64 bits float/int array */
void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, vector<GenRegister> temps);
+ /*! Read64 A64 */
+ void READ64A64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum);
+ /*! Write64 A64 */
+ void WRITE64A64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, vector<GenRegister> temps);
/*! Untyped write (up to 4 elements) */
@@ -644,6 +706,14 @@ namespace gbe
void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, vector<GenRegister> temps);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, vector <GenRegister> temps);
+ /*! Byte gather a64 (for unaligned bytes, shorts and ints) */
+ void BYTE_GATHERA64(Reg dst, Reg addr, uint32_t elemSize);
+ /*! Byte scatter a64 (for unaligned bytes, shorts and ints) */
+ void BYTE_SCATTERA64(GenRegister *msg, unsigned msgNum, uint32_t elemSize);
+ /*! Untyped read a64 (up to 4 elements) */
+ void UNTYPED_READA64(Reg addr, const GenRegister *dst, uint32_t dstNum, uint32_t elemNum);
+ /*! Untyped write a64 (up to 4 elements) */
+ void UNTYPED_WRITEA64(const GenRegister *msgs, uint32_t msgNum, uint32_t elemNum);
/*! DWord scatter (for constant cache read) */
void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
/*! Unpack the uint to charN */
@@ -681,7 +751,7 @@ namespace gbe
/*! Store the profiling info */
void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum);
/*! Printf */
- void PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8],
+ void PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8],
int srcNum, uint16_t num, bool isContinue, uint32_t totalSize);
/*! Multiply 64-bit integers */
void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long);
@@ -696,19 +766,19 @@ namespace gbe
GenRegister tmpData1,
GenRegister localThreadID, GenRegister localThreadNUM,
GenRegister tmpData2, GenRegister slmOff,
- vector<GenRegister> msg, uint32_t msgSizeReq,
+ vector<GenRegister> msg,
GenRegister localBarrier);
/*! Sub Group Operations */
void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
GenRegister tmpData1, GenRegister tmpData2);
/*! Oblock read */
- void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
/*! Oblock write */
- void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
/*! Media block read */
- void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size);
/*! Media block write */
- void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t data_size);
/* common functions for both binary instruction and sel_cmp and compare instruction.
It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -777,9 +847,9 @@ namespace gbe
GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
}
- INLINE vector<GenRegister> getBTITemps(const ir::BTI &bti) {
+ INLINE vector<GenRegister> getBTITemps(const ir::AddressMode &AM) {
vector<GenRegister> temps;
- if (!bti.isConst) {
+ if (AM == ir::AM_DynamicBti) {
temps.push_back(selReg(reg(ir::FAMILY_WORD, true), ir::TYPE_U16));
temps.push_back(selReg(reg(ir::FAMILY_DWORD, true), ir::TYPE_U32));
}
@@ -798,6 +868,7 @@ namespace gbe
bool bHasDoubleType;
bool bHasHalfType;
bool bLongRegRestrict;
+ bool bHasSends;
uint32_t ldMsgOrder;
bool slowByteGather;
INLINE ir::LabelIndex newAuxLabel()
@@ -840,7 +911,7 @@ namespace gbe
maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
stateNum(0), vectorNum(0), bwdCodeGeneration(false), storeThreadMap(false),
currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false), bHasLongType(false),
- bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false),
+ bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false), bHasSends(false),
ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
{
const ir::Function &fn = ctx.getFunction();
@@ -1094,7 +1165,7 @@ namespace gbe
if(this->block->removeSimpleIfEndif){
mov->state.predicate = GEN_PREDICATE_NORMAL;
mov->state.flag = 0;
- mov->state.subFlag = 0;
+ mov->state.subFlag = 1;
}
if (this->isScalarReg(insn->src(regID).reg()))
mov->state.noMask = 1;
@@ -1128,7 +1199,7 @@ namespace gbe
if(this->block->removeSimpleIfEndif){
mov->state.predicate = GEN_PREDICATE_NORMAL;
mov->state.flag = 0;
- mov->state.subFlag = 0;
+ mov->state.subFlag = 1;
}
if (simdWidth == 1) {
mov->state.noMask = 1;
@@ -1318,6 +1389,48 @@ namespace gbe
insn->extra.function = function;
insn->extra.elem = msgPayload;
+ if (hasSends() && msgPayload > 1) {
+ insn->extra.splitSend = 1;
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 1;
+ vector->offsetID = 0;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+
+ vector = this->appendVector();
+ vector->regNum = msgPayload - 1;
+ vector->offsetID = 1;
+ vector->reg = &insn->src(1);
+ vector->isSrc = 1;
+ } else {
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = msgPayload; //bti not included in SelectionVector
+ vector->offsetID = 0;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+ }
+ }
+
+ void Selection::Opaque::ATOMICA64(Reg dst, uint32_t function,
+ uint32_t msgPayload, vector<GenRegister> src,
+ GenRegister bti,
+ vector<GenRegister> temps) {
+ unsigned dstNum = 1 + temps.size();
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMICA64, dstNum, msgPayload + 1);
+
+ insn->dst(0) = dst;
+ if(temps.size()) {
+ insn->dst(1) = temps[0];
+ insn->dst(2) = temps[1];
+ }
+
+ for (uint32_t elemID = 0; elemID < msgPayload; ++elemID)
+ insn->src(elemID) = src[elemID];
+ insn->src(msgPayload) = bti;
+
+ insn->extra.function = function;
+ insn->extra.elem = msgPayload;
+
SelectionVector *vector = this->appendVector();
vector->regNum = msgPayload; //bti not included in SelectionVector
vector->offsetID = 0;
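Each SelectionVector is a request to the register allocator for a contiguous run of message registers; with split send, the address block and the data block become two independent requests. A structure-only model of ATOMIC()'s bookkeeping above (the real code stores pointers to the instruction's registers, not counts):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Vec { uint32_t regNum, offsetID; };

    static std::vector<Vec> atomicVectors(uint32_t msgPayload, bool hasSends) {
      if (hasSends && msgPayload > 1)
        return { {1, 0}, {msgPayload - 1, 1} }; // address block + data block
      return { {msgPayload, 0} };               // one combined block
    }

    int main() {
      std::vector<Vec> v = atomicVectors(3, true);
      assert(v.size() == 2 && v[0].regNum == 1 && v[1].regNum == 2);
      v = atomicVectors(3, false);
      assert(v.size() == 1 && v[0].regNum == 3);
      return 0;
    }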
@@ -1399,6 +1512,39 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
+ void Selection::Opaque::READ64A64(Reg addr,
+ const GenRegister *dst,
+ const GenRegister *tmp,
+ uint32_t elemNum)
+ {
+ SelectionInstruction *insn = NULL;
+ SelectionVector *srcVector = NULL;
+ SelectionVector *dstVector = NULL;
+ insn = this->appendInsn(SEL_OP_READ64A64,elemNum*2, 1);
+ srcVector = this->appendVector();
+ dstVector = this->appendVector();
+
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->dst(elemID) = tmp[elemID];
+
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->dst(elemID + elemNum) = dst[elemID];
+
+ insn->src(0) = addr;
+
+ insn->extra.elem = elemNum;
+
+ dstVector->regNum = elemNum;
+ dstVector->isSrc = 0;
+ dstVector->offsetID = 0;
+ dstVector->reg = &insn->dst(0);
+
+ srcVector->regNum = 1;
+ srcVector->offsetID = 0;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
void Selection::Opaque::UNTYPED_READ(Reg addr,
const GenRegister *dst,
uint32_t elemNum,
@@ -1439,6 +1585,34 @@ namespace gbe
srcVector->offsetID = 0;
srcVector->reg = &insn->src(0);
}
+ void Selection::Opaque::UNTYPED_READA64(Reg addr,
+ const GenRegister *dst,
+ uint32_t dstNum,
+ uint32_t elemNum)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READA64, dstNum, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+ if (this->isScalarReg(dst[0].reg()))
+ insn->state.noMask = 1;
+ // Regular instruction to encode
+ for (uint32_t id = 0; id < dstNum; ++id)
+ insn->dst(id) = dst[id];
+
+ insn->src(0) = addr;
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation
+ dstVector->regNum = dstNum;
+ dstVector->isSrc = 0;
+ dstVector->offsetID = 0;
+ dstVector->reg = &insn->dst(0);
+
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->offsetID = 0;
+ srcVector->reg = &insn->src(0);
+ }
void Selection::Opaque::WRITE64(Reg addr,
const GenRegister *src,
@@ -1479,7 +1653,6 @@ namespace gbe
// dst: srcNum, (flagTemp)
// src: srcNum, addr, srcNum, bti.
insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
- vector = this->appendVector();
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->src(elemID) = src[elemID];
@@ -1500,10 +1673,29 @@ namespace gbe
}
insn->extra.elem = srcNum;
- vector->regNum = srcNum + 1;
- vector->offsetID = srcNum;
- vector->reg = &insn->src(srcNum);
- vector->isSrc = 1;
+ if (hasSends()) {
+ insn->extra.splitSend = 1;
+
+ //addr regs
+ vector = this->appendVector();
+ vector->regNum = 1;
+ vector->offsetID = srcNum;
+ vector->reg = &insn->src(srcNum);
+ vector->isSrc = 1;
+
+ //data regs
+ vector = this->appendVector();
+ vector->regNum = srcNum;
+ vector->offsetID = srcNum+1;
+ vector->reg = &insn->src(srcNum+1);
+ vector->isSrc = 1;
+ } else {
+ vector = this->appendVector();
+ vector->regNum = srcNum + 1;
+ vector->offsetID = srcNum;
+ vector->reg = &insn->src(srcNum);
+ vector->isSrc = 1;
+ }
}
if (bti.file != GEN_IMMEDIATE_VALUE) {
@@ -1512,6 +1704,38 @@ namespace gbe
}
}
+ void Selection::Opaque::WRITE64A64(Reg addr,
+ const GenRegister *src,
+ const GenRegister *tmp,
+ uint32_t srcNum)
+ {
+ SelectionVector *vector = NULL;
+ SelectionInstruction *insn = NULL;
+
+ const uint32_t dstNum = srcNum;
+ insn = this->appendInsn(SEL_OP_WRITE64A64, dstNum, srcNum*2 + 1);
+ vector = this->appendVector();
+
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->src(elemID) = src[elemID];
+
+ insn->src(srcNum) = addr;
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->src(srcNum + 1 + elemID) = tmp[elemID];
+
+ /* We also need to add the tmp registers to dst, in order
+ to avoid a post-scheduling error. */
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->dst(elemID) = tmp[elemID];
+
+ insn->extra.elem = srcNum;
+
+ vector->regNum = srcNum + 1;
+ vector->offsetID = srcNum;
+ vector->reg = &insn->src(srcNum);
+ vector->isSrc = 1;
+ }
+
void Selection::Opaque::UNTYPED_WRITE(Reg addr,
const GenRegister *src,
uint32_t elemNum,
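Listing the tmp registers as both sources and destinations gives the post-register-allocation scheduler an output dependence to respect, which is what the comment in WRITE64A64 is getting at. A toy dependence check illustrating the point (assuming the scheduler orders instructions only via intersecting dst/src sets; register numbers are arbitrary):

    #include <cassert>
    #include <set>

    struct Insn { std::set<int> dst, src; };

    static bool mustOrder(const Insn &a, const Insn &b) {
      for (int r : a.dst)
        if (b.src.count(r) || b.dst.count(r)) return true; // RAW or WAW
      for (int r : a.src)
        if (b.dst.count(r)) return true;                   // WAR
      return false;
    }

    int main() {
      Insn write64a64 { /*dst=*/{10, 11}, /*src=*/{1, 2, 10, 11} }; // tmps in dst
      Insn laterUse   { /*dst=*/{3},      /*src=*/{10} };
      assert(mustOrder(write64a64, laterUse)); // ordering is now enforced
      return 0;
    }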
@@ -1521,7 +1745,6 @@ namespace gbe
unsigned dstNum = temps.size();
unsigned srcNum = elemNum + 2 + temps.size();
SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, srcNum);
- SelectionVector *vector = this->appendVector();
if (bti.file != GEN_IMMEDIATE_VALUE) {
insn->state.flag = 0;
@@ -1541,11 +1764,45 @@ namespace gbe
}
insn->extra.elem = elemNum;
+ if (hasSends()) {
+ insn->extra.splitSend = 1;
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = elemNum;
+ vector->reg = &insn->src(1);
+ vector->offsetID = 1;
+ vector->isSrc = 1;
+ vector = this->appendVector();
+ vector->regNum = 1;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
+ } else {
// Sends require contiguous allocation for the sources
+ SelectionVector *vector = this->appendVector();
vector->regNum = elemNum+1;
vector->reg = &insn->src(0);
vector->offsetID = 0;
vector->isSrc = 1;
+ }
+ }
+
+ void Selection::Opaque::UNTYPED_WRITEA64(const GenRegister *src,
+ uint32_t msgNum,
+ uint32_t elemNum)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITEA64, 0, msgNum);
+ SelectionVector *vector = this->appendVector();
+
+ // Regular instruction to encode
+ for (uint32_t id = 0; id < msgNum; ++id)
+ insn->src(id) = src[id];
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation for the sources
+ vector->regNum = msgNum;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
}
void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr,
@@ -1591,7 +1848,6 @@ namespace gbe
GenRegister bti, vector<GenRegister> temps) {
unsigned dstNum = temps.size();
SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
- SelectionVector *vector = this->appendVector();
if (bti.file != GEN_IMMEDIATE_VALUE) {
insn->state.flag = 0;
@@ -1608,8 +1864,59 @@ namespace gbe
insn->src(2) = bti;
insn->extra.elem = elemSize;
- // value and address are contiguous in the send
- vector->regNum = 2;
+ if (hasSends()) {
+ insn->extra.splitSend = 1;
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 1;
+ vector->isSrc = 1;
+ vector->offsetID = 0;
+ vector->reg = &insn->src(0);
+
+ vector = this->appendVector();
+ vector->regNum = 1;
+ vector->isSrc = 1;
+ vector->offsetID = 1;
+ vector->reg = &insn->src(1);
+ } else {
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 2;
+ vector->isSrc = 1;
+ vector->offsetID = 0;
+ vector->reg = &insn->src(0);
+ }
+ }
+
+ void Selection::Opaque::BYTE_GATHERA64(Reg dst, Reg addr, uint32_t elemSize) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHERA64, 1, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+
+ if (this->isScalarReg(dst.reg()))
+ insn->state.noMask = 1;
+
+ insn->src(0) = addr;
+ insn->dst(0) = dst;
+ insn->extra.elem = elemSize;
+
+ dstVector->regNum = 1;
+ dstVector->isSrc = 0;
+ dstVector->offsetID = 0;
+ dstVector->reg = &insn->dst(0);
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->offsetID = 0;
+ srcVector->reg = &insn->src(0);
+ }
+
+ void Selection::Opaque::BYTE_SCATTERA64(GenRegister *msg, uint32_t msgNum, uint32_t elemSize) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTERA64, 0, msgNum);
+ SelectionVector *vector = this->appendVector();
+ for (unsigned i = 0; i < msgNum; i++)
+ insn->src(i) = msg[i];
+
+ insn->extra.elem = elemSize;
+
+ vector->regNum = msgNum;
vector->isSrc = 1;
vector->offsetID = 0;
vector->reg = &insn->src(0);
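BYTE_SCATTER shows the same split: a classic send needs address and data in adjacent registers, while sends lets them be allocated independently. A counts-only mirror of the vectors built above:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Vec { uint32_t regNum, offsetID; };

    static std::vector<Vec> byteScatterVectors(bool hasSends) {
      if (hasSends)
        return { {1, 0}, {1, 1} }; // address and data placed independently
      return { {2, 0} };           // address+data as one contiguous pair
    }

    int main() {
      assert(byteScatterVectors(true).size() == 2);
      assert(byteScatterVectors(false)[0].regNum == 2);
      return 0;
    }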
@@ -1930,51 +2237,41 @@ namespace gbe
}
}
- void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister tmp1,
+ void Selection::Opaque::PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1,
GenRegister src[8], int srcNum, uint16_t num, bool isContinue, uint32_t totalSize) {
- if (isContinue) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum + 1);
- SelectionVector *vector = this->appendVector();
-
- for (int i = 0; i < srcNum; i++)
- insn->src(i) = src[i];
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 2, srcNum);
- insn->src(srcNum) = tmp0;
+ for (int i = 0; i < srcNum; i++)
+ insn->src(i) = src[i];
- insn->dst(0) = dst;
- insn->dst(1) = tmp0;
- insn->dst(2) = tmp1;
+ insn->dst(0) = tmp0;
+ insn->dst(1) = tmp1;
- vector->regNum = 2;
- vector->reg = &insn->dst(1);
+ if (hasSends()) {
+ insn->extra.printfSplitSend = 1;
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 1;
+ vector->reg = &insn->dst(0);
vector->offsetID = 0;
vector->isSrc = 0;
- insn->extra.printfSize = static_cast<uint16_t>(totalSize);
- insn->extra.continueFlag = isContinue;
- insn->extra.printfBTI = bti;
- insn->extra.printfNum = num;
+ vector = this->appendVector();
+ vector->regNum = 1;
+ vector->reg = &insn->dst(1);
+ vector->offsetID = 1;
+ vector->isSrc = 0;
} else {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
SelectionVector *vector = this->appendVector();
-
- for (int i = 0; i < srcNum; i++)
- insn->src(i) = src[i];
-
- insn->dst(0) = dst;
- insn->dst(1) = tmp0;
- insn->dst(2) = tmp1;
-
vector->regNum = 2;
- vector->reg = &insn->dst(1);
+ vector->reg = &insn->dst(0);
vector->offsetID = 0;
vector->isSrc = 0;
-
- insn->extra.printfSize = static_cast<uint16_t>(totalSize);
- insn->extra.continueFlag = isContinue;
- insn->extra.printfBTI = bti;
- insn->extra.printfNum = num;
}
+
+ insn->extra.printfSize = static_cast<uint16_t>(totalSize);
+ insn->extra.continueFlag = isContinue;
+ insn->extra.printfBTI = bti;
+ insn->extra.printfNum = num;
}
void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
@@ -1986,19 +2283,11 @@ namespace gbe
GenRegister tmpData2,
GenRegister slmOff,
vector<GenRegister> msg,
- uint32_t msgSizeReq,
GenRegister localBarrier)
{
SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6);
- SelectionVector *vector = this->appendVector();
- /* allocate continuous GRF registers for READ/WRITE to SLM */
- GBE_ASSERT(msg.size() >= msgSizeReq);
- vector->regNum = msg.size();
- vector->offsetID = 0;
- vector->reg = &insn->dst(2);
- vector->isSrc = 0;
- insn->extra.workgroupOp = wg_op;
+ insn->extra.wgop.workgroupOp = wg_op;
insn->dst(0) = dst;
insn->dst(1) = tmpData1;
@@ -2011,6 +2300,29 @@ namespace gbe
insn->src(3) = tmpData2;
insn->src(4) = slmOff;
insn->src(5) = localBarrier;
+
+ if (hasSends()) {
+ insn->extra.wgop.splitSend = 1;
+ SelectionVector *vector = this->appendVector();
+
+ vector->regNum = 1;
+ vector->offsetID = 2;
+ vector->reg = &insn->dst(2);
+ vector->isSrc = 0;
+
+ vector = this->appendVector();
+ vector->regNum = msg.size() - 1;
+ vector->offsetID = 3;
+ vector->reg = &insn->dst(3);
+ vector->isSrc = 0;
+ } else {
+ /* allocate continuous GRF registers for READ/WRITE to SLM */
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = msg.size();
+ vector->offsetID = 2;
+ vector->reg = &insn->dst(2);
+ vector->isSrc = 0;
+ }
}
void Selection::Opaque::SUBGROUP_OP(uint32_t wg_op,
@@ -2021,7 +2333,7 @@ namespace gbe
{
SelectionInstruction *insn = this->appendInsn(SEL_OP_SUBGROUP_OP, 2, 2);
- insn->extra.workgroupOp = wg_op;
+ insn->extra.wgop.workgroupOp = wg_op;
insn->dst(0) = dst;
insn->dst(1) = tmpData1;
@@ -2031,119 +2343,115 @@ namespace gbe
}
void Selection::Opaque::OBREAD(GenRegister* dsts,
uint32_t vec_size,
- GenRegister addr,
GenRegister header,
uint32_t bti,
- GenRegister* tmp,
- uint32_t tmp_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+ uint32_t ow_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1);
SelectionVector *vector = this->appendVector();
- insn->dst(0) = header;
+ insn->src(0) = header;
for (uint32_t i = 0; i < vec_size; ++i)
- insn->dst(1 + i) = dsts[i];
- for (uint32_t i = 0; i < tmp_size; ++i)
- insn->dst(1 + i + vec_size) = tmp[i];
- insn->src(0) = addr;
+ insn->dst(i) = dsts[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // number of vector size
+ insn->extra.elem = ow_size; // OWord block size
// dst regs for OWORD read must be contiguous
- vector->regNum = tmp_size;
- vector->reg = &insn->dst(1 + vec_size);
- vector->offsetID = 1 + vec_size;
+ vector->regNum = vec_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
vector->isSrc = 0;
}
- void Selection::Opaque::OBWRITE(GenRegister addr,
+ void Selection::Opaque::OBWRITE(GenRegister header,
GenRegister* values,
uint32_t vec_size,
- GenRegister header,
uint32_t bti,
- GenRegister* tmp,
- uint32_t tmp_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
- SelectionVector *vector = this->appendVector();
- insn->src(0) = addr;
+ uint32_t ow_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1);
+ insn->src(0) = header;
for (uint32_t i = 0; i < vec_size; ++i)
insn->src(i + 1) = values[i];
- insn->dst(0) = header;
- for (uint32_t i = 0; i < tmp_size; ++i)
- insn->dst(i + 1) = tmp[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // number of vector_size
+ insn->extra.elem = ow_size; // number of OWords in the message
+
+ // For A64 writes, split-send ("sends") support has not been added yet.
+ if (hasSends() && bti != 255) {
+ insn->extra.splitSend = 1;
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 1;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
+
+ vector = this->appendVector();
+ vector->regNum = vec_size;
+ vector->reg = &insn->src(1);
+ vector->offsetID = 1;
+ vector->isSrc = 1;
+ } else {
+ // tmp regs for OWORD write header and values
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = vec_size + 1;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
+ }
- // tmp regs for OWORD read dst
- vector->regNum = tmp_size + 1;
- vector->reg = &insn->dst(0);
- vector->offsetID = 0;
- vector->isSrc = 0;
}
void Selection::Opaque::MBREAD(GenRegister* dsts,
- GenRegister coordx,
- GenRegister coordy,
+ uint32_t tmp_size,
GenRegister header,
- GenRegister* tmp,
uint32_t bti,
- uint32_t vec_size) {
+ uint32_t response_size) {
- uint32_t simdWidth = curr.execWidth;
- SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
-
- insn->dst(0) = header;
- for (uint32_t i = 0; i < vec_size; ++i) {
- insn->dst(i + 1) = dsts[i];
- if(simdWidth == 16)
- insn->dst(i + vec_size + 1) = tmp[i];
- }
- insn->src(0) = coordx;
- insn->src(1) = coordy;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+ insn->src(0) = header;
insn->setbti(bti);
- insn->extra.elem = vec_size; // vector size
+ insn->extra.elem = response_size; // send response length
- // Only in simd 8 the data is in vector form
- if(simdWidth == 8) {
- SelectionVector *vector = this->appendVector();
- vector->regNum = vec_size;
- vector->reg = &insn->dst(1);
- vector->offsetID = 1;
- vector->isSrc = 0;
- }
- if(simdWidth == 16)
- {
- SelectionVector *vectortmp = this->appendVector();
- vectortmp->regNum = vec_size;
- vectortmp->reg = &insn->dst(vec_size + 1);
- vectortmp->offsetID = vec_size + 1;
- vectortmp->isSrc = 0;
+ for (uint32_t i = 0; i < tmp_size; ++i) {
+ insn->dst(i) = dsts[i];
}
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = tmp_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
}
- void Selection::Opaque::MBWRITE(GenRegister coordx,
- GenRegister coordy,
+ void Selection::Opaque::MBWRITE(GenRegister header,
GenRegister* values,
- GenRegister header,
- GenRegister* tmp,
+ uint32_t tmp_size,
uint32_t bti,
- uint32_t vec_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
- SelectionVector *vector = this->appendVector();
- insn->src(0) = coordx;
- insn->src(1) = coordy;
- for (uint32_t i = 0; i < vec_size; ++i)
- insn->src(2 + i) = values[i];
- insn->dst(0) = header;
- for (uint32_t i = 0; i < vec_size; ++i)
- insn->dst(1 + i) = tmp[i];
- insn->state = this->curr;
+ uint32_t data_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size);
+ insn->src(0) = header;
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->src(1 + i) = values[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // vector size
+ insn->extra.elem = data_size; // msg data part size
- // We need to put the header and the data together
- vector->regNum = 1 + vec_size;
- vector->reg = &insn->dst(0);
- vector->offsetID = 0;
- vector->isSrc = 0;
+ if (hasSends()) {
+ insn->extra.splitSend = 1;
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 1;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
+
+ vector = this->appendVector();
+ vector->regNum = tmp_size;
+ vector->reg = &insn->src(1);
+ vector->offsetID = 1;
+ vector->isSrc = 1;
+ } else {
+ // We need to put the header and the data together
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = 1 + tmp_size;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
+ }
}
// Boiler plate to initialize the selection library at c++ pre-main
@@ -2364,7 +2672,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
this->push();
this->curr.predicate = GEN_PREDICATE_NORMAL;
this->curr.flag = 0;
- this->curr.subFlag = 0;
+ this->curr.subFlag = 1;
}
// If there is no branch at the end of this block.
@@ -2379,7 +2687,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
if(this->block->removeSimpleIfEndif){
this->curr.predicate = GEN_PREDICATE_NONE;
this->curr.flag = 0;
- this->curr.subFlag = 0;
+ this->curr.subFlag = 1;
this->pop();
}
// If we are in if/endif fix mode, and this block is
@@ -2389,13 +2697,14 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
this->block->insnList.size() != 0 &&
this->block->insnList.size() % 1000 == 0 &&
this->block->endifLabel.value() != 0) {
+ this->curr.flag = 0;
+ this->curr.subFlag = 1;
ir::LabelIndex jip = this->block->endifLabel;
this->ENDIF(GenRegister::immd(0), jip);
this->push();
this->curr.predicate = GEN_PREDICATE_NORMAL;
this->IF(GenRegister::immd(0), jip, jip);
this->pop();
- this->block->isLargeBlock = true;
}
// Output the code in the current basic block
this->endBackwardGeneration();
@@ -2524,6 +2833,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
this->opaque->setSlowByteGather(false);
this->opaque->setHasHalfType(true);
+ this->opaque->setHasSends(true);
opt_features = SIOF_LOGICAL_SRCMOD;
}
@@ -2545,6 +2855,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
this->opaque->setSlowByteGather(false);
this->opaque->setHasHalfType(true);
+ this->opaque->setHasSends(true);
opt_features = SIOF_LOGICAL_SRCMOD;
}
@@ -2553,7 +2864,6 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
uint32_t elemID = 0;
uint32_t i;
SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
- SelectionVector *msgVector = this->appendVector();;
for( i = 0; i < msgNum; ++i, ++elemID)
insn->src(elemID) = msgs[i];
@@ -2561,11 +2871,31 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
insn->setbti(bti);
insn->extra.msglen = msgNum;
insn->extra.is3DWrite = is3D;
- // Sends require contiguous allocation
- msgVector->regNum = msgNum;
- msgVector->isSrc = 1;
- msgVector->offsetID = 0;
- msgVector->reg = &insn->src(0);
+
+ if (hasSends()) {
+ assert(msgNum == 9);
+ insn->extra.typedWriteSplitSend = 1;
+ //header + coords
+ SelectionVector *msgVector = this->appendVector();
+ msgVector->regNum = 5;
+ msgVector->isSrc = 1;
+ msgVector->offsetID = 0;
+ msgVector->reg = &insn->src(0);
+
+ //data
+ msgVector = this->appendVector();
+ msgVector->regNum = 4;
+ msgVector->isSrc = 1;
+ msgVector->offsetID = 5;
+ msgVector->reg = &insn->src(5);
+ } else {
+ // Sends require contiguous allocation
+ SelectionVector *msgVector = this->appendVector();
+ msgVector->regNum = msgNum;
+ msgVector->isSrc = 1;
+ msgVector->offsetID = 0;
+ msgVector->reg = &insn->src(0);
+ }
}
Selection::~Selection(void) { GBE_DELETE(this->opaque); }
@@ -2847,6 +3177,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
case ir::OP_FBL: sel.FBL(dst, src); break;
case ir::OP_CBIT: sel.CBIT(dst, src); break;
case ir::OP_LZD: sel.LZD(dst, src); break;
+ case ir::OP_BFREV: sel.BFREV(dst, src); break;
case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
@@ -3573,8 +3904,11 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
reg == ir::ocl::lid1 ||
reg == ir::ocl::lid2 ||
reg == ir::ocl::lsize0 ||
- reg == ir::ocl::lsize1||
- reg == ir::ocl::lsize2)
+ reg == ir::ocl::lsize1 ||
+ reg == ir::ocl::lsize2 ||
+ reg == ir::ocl::enqlsize0 ||
+ reg == ir::ocl::enqlsize1 ||
+ reg == ir::ocl::enqlsize2)
return true;
else
return false;
@@ -3799,6 +4133,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
return GEN_BYTE_SCATTER_BYTE;
}
}
+ ir::Register generateLocalMask(Selection::Opaque &sel, GenRegister addr) {
+ sel.push();
+ ir::Register localMask = sel.reg(ir::FAMILY_BOOL);
+ sel.curr.physicalFlag = 0;
+ sel.curr.modFlag = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.flagIndex = localMask;
+ sel.CMP(GEN_CONDITIONAL_L, addr, GenRegister::immud(64*1024));
+ sel.pop();
+ return localMask;
+ }
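
generateLocalMask encodes the generic-address-space convention used by the new load/store paths: a pointer below 64KB is treated as __local (SLM, sent with bti 0xfe) and anything above as stateless global. A rough scalar emulation of the per-lane predicate, purely illustrative:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t addrs[8] = {0x40, 0xFFFF, 0x10000, 0x20000,
                                 0x100, 0xFFFFFFF, 0x8000, 0x12345678};
      uint16_t localMask = 0;                        // one bit per SIMD lane
      for (int lane = 0; lane < 8; ++lane)
        if (addrs[lane] < 64 * 1024)                 // CMP.l addr, 64KB
          localMask |= 1u << lane;
      uint16_t statelessMask = (~localMask) & 0xFF;  // inversePredicate = 1
      std::printf("local=0x%02x stateless=0x%02x\n", localMask, statelessMask);
      return 0;
    }
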
class LoadInstructionPattern : public SelectionPattern
{
@@ -3807,36 +4152,140 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
LoadInstructionPattern(void) : SelectionPattern(1, 1) {
this->opcodes.push_back(ir::OP_LOAD);
}
- void readDWord(Selection::Opaque &sel,
+ bool isReadConstantLegacy(const ir::LoadInstruction &load) const {
+ ir::AddressMode AM = load.getAddressMode();
+ ir::AddressSpace AS = load.getAddressSpace();
+ if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT)
+ return true;
+ return false;
+ }
+ void untypedReadStateless(Selection::Opaque &sel,
+ GenRegister addr,
+ vector<GenRegister> &dst
+ ) const {
+ using namespace ir;
+ GenRegister addrQ;
+ unsigned simdWidth = sel.curr.execWidth;
+ unsigned addrBytes = typeSize(addr.type);
+ unsigned valueNum = dst.size();
+ bool isUniform = sel.isScalarReg(dst[0].reg());
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else if (addrBytes == 8) {
+ addrQ = addr;
+ } else
+ NOT_IMPLEMENTED;
+
+ if (simdWidth == 8) {
+ sel.UNTYPED_READA64(addrQ, dst.data(), valueNum, valueNum);
+ } else if (simdWidth == 16) {
+ std::vector<GenRegister> tmpData;
+ for (unsigned i = 0; i < (valueNum+1)/2; i++) {
+ tmpData.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32));
+ }
+ sel.push();
+ /* first quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 0), tmpData.data(), (valueNum+1)/2, valueNum);
+
+ sel.push();
+ if (isUniform)
+ sel.curr.execWidth = 1;
+ for (unsigned k = 0; k < valueNum; k++) {
+ sel.MOV(GenRegister::Qn(dst[k], 0), GenRegister::Qn(tmpData[k/2], k%2));
+ }
+ sel.pop();
+
+ /* second quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 1), tmpData.data(), (valueNum+1)/2, valueNum);
+ if (isUniform)
+ sel.curr.execWidth = 1;
+ for (unsigned k = 0; k < valueNum; k++) {
+ sel.MOV(GenRegister::Qn(dst[k], 1), GenRegister::Qn(tmpData[k/2], k%2));
+ }
+ sel.pop();
+ }
+ }
+
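
untypedReadStateless above splits a SIMD16 A64 read into two SIMD8 messages under quarter control, because the A64 untyped read is issued SIMD8 here. A simplified stand-in that fakes the send with a loop and assumes nothing about the real message encoding:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for one SIMD8 A64 untyped read of a single dword value.
    static void reada64_simd8(const uint64_t *addrQ, uint32_t *out) {
      for (int lane = 0; lane < 8; ++lane)
        out[lane] = (uint32_t)addrQ[lane];  // pretend load from each address
    }

    int main() {
      uint64_t addrQ[16];
      uint32_t dst[16];
      for (int i = 0; i < 16; ++i) addrQ[i] = 0x1000u + 4u * i;
      reada64_simd8(addrQ + 0, dst + 0);    // quarterControl = Q1
      reada64_simd8(addrQ + 8, dst + 8);    // quarterControl = Q2
      std::printf("dst[0]=0x%x dst[15]=0x%x\n", dst[0], dst[15]);
      return 0;
    }
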
+ void shootUntypedReadMsg(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
vector<GenRegister> &dst,
GenRegister addr,
uint32_t valueNum,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- sel.UNTYPED_READ(addr, dst.data(), valueNum, b, sel.getBTITemps(bti));
+ using namespace ir;
+ unsigned addrBytes = typeSize(addr.type);
+ AddressMode AM = insn.getAddressMode();
+
+ /* Note on uniform LoadInstruction: the all-lanes-active (noMask, noPredicate)
+ * property only needs special care when the value is UNIFORM; if the
+ * value is not uniform, just do things under predication or mask */
+ bool isUniform = sel.isScalarReg(dst[0].reg());
+ sel.push();
+ if (isUniform) {
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ }
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.UNTYPED_READ(addr, dst.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) ) {
+ // in stateless mode, local/constant accesses still go through a BTI
+ unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+ GenRegister addrDW = addr;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, addr);
+ sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(bti), btiTemp);
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, addr);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = addr;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, addr);
+ sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+
+ sel.curr.inversePredicate = 1;
+ untypedReadStateless(sel, addr, dst);
+ sel.pop();
+
+ } else {
+ untypedReadStateless(sel, addr, dst);
+ }
+ sel.pop();
}
void emitUntypedRead(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> dst(valueNum);
for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
- readDWord(sel, dst, addr, valueNum, bti);
+ shootUntypedReadMsg(sel, insn, dst, addr, valueNum, addrSpace);
}
void emitDWordGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
- GBE_ASSERT(bti.isConst == 1);
GBE_ASSERT(insn.getValueNum() == 1);
const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
@@ -3844,7 +4293,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
sel.push();
sel.curr.noMask = 1;
- sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
+ sel.SAMPLE(&dst, 1, &addr, 1, BTI_CONSTANT, 0, true, true);
sel.pop();
return;
}
@@ -3857,52 +4306,144 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
if (sel.isScalarReg(addr.reg())) {
sel.curr.noMask = 1;
}
- sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+ if (sel.getRegisterFamily(addr.reg()) == FAMILY_QWORD) {
+ // as we still use offset instead of absolut graphics address,
+ // as we still use an offset instead of an absolute graphics address,
+ GenRegister t = convertU64ToU32(sel, addr);
+ sel.SHR(addrDW, t, GenRegister::immud(2));
+ } else
+ sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
sel.pop();
- sel.DWORD_GATHER(dst, addrDW, bti.imm);
+ sel.DWORD_GATHER(dst, addrDW, BTI_CONSTANT);
+ }
+
+ void read64Legacy(Selection::Opaque &sel,
+ GenRegister addr,
+ vector<GenRegister> &dst,
+ GenRegister bti,
+ vector<GenRegister> &btiTemp) const {
+ const uint32_t valueNum = dst.size();
+ if (sel.hasLongType()) {
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+
+ sel.READ64(addr, dst.data(), tmp.data(), valueNum, bti, true, btiTemp);
+ } else {
+ sel.READ64(addr, dst.data(), NULL, valueNum, bti, false, btiTemp);
+ }
+ }
+ void read64Stateless(Selection::Opaque &sel,
+ const GenRegister addr,
+ vector<GenRegister> dst) const {
+ using namespace ir;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned valueNum = dst.size();
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+ unsigned addrBytes = typeSize(addr.type);
+ GenRegister addrQ;
+
+ sel.push();
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else {
+ addrQ = addr;
+ }
+
+ if (simdWidth == 8) {
+ sel.READ64A64(addrQ, dst.data(), tmp.data(), valueNum);
+ } else {
+ assert(valueNum == 1);
+ GenRegister tmpAddr, tmpDst;
+ tmpAddr = GenRegister::Qn(addrQ, 0);
+ tmpDst = GenRegister::Qn(dst[0], 0);
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum);
+
+ tmpAddr = GenRegister::Qn(addrQ, 1);
+ tmpDst = GenRegister::Qn(dst[0], 1);
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum);
+ }
+ sel.pop();
}
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- GBE_ASSERT(bti.isConst == 1);
vector<GenRegister> dst(valueNum);
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
- if (sel.hasLongType()) {
- vector<GenRegister> tmp(valueNum);
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
- tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ bool isUniform = sel.isScalarReg(insn.getValue(0));
+ unsigned addrBytes = typeSize(addr.type);
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ sel.push();
+ if (isUniform) {
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
}
+ if (AM != AM_Stateless) {
+ GenRegister b;
+ if (AM == AM_DynamicBti) {
+ b = sel.selReg(insn.getBtiReg(), TYPE_U32);
+ } else {
+ b = GenRegister::immud(insn.getSurfaceIndex());
+ }
+ read64Legacy(sel, addr, dst, b, btiTemp);
+ } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) {
+ GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 0xfe : BTI_CONSTANT);
+ GenRegister addrDW = addr;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, addr);
+ read64Legacy(sel, addrDW, dst, b, btiTemp);
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, addr);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = addr;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, addr);
+ read64Legacy(sel, addrDW, dst, GenRegister::immud(0xfe), btiTemp);
- sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
- } else {
- sel.READ64(addr, dst.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
- }
+ sel.curr.inversePredicate = 1;
+ read64Stateless(sel, addr, dst);
+ sel.pop();
+ } else {
+ read64Stateless(sel, addr, dst);
+ }
+ sel.pop();
}
void readByteAsDWord(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
GenRegister dst,
bool isUniform,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
+ RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
+ Type addrType = getType(addrFamily);
Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
- GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ GenRegister tmpAddr = sel.selReg(sel.reg(addrFamily, isUniform), addrType);
GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
-
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ GenRegister addrOffset = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
// Get dword aligned addr
sel.push();
@@ -3910,24 +4451,36 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.curr.noMask = 1;
sel.curr.execWidth = 1;
}
- sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+ if (addrFamily == FAMILY_DWORD)
+ sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+ else {
+ sel.MOV(tmpAddr, GenRegister::immuint64(0xfffffffffffffffc));
+ sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UL), tmpAddr);
+ }
+
sel.pop();
sel.push();
+ vector<GenRegister> tmp;
+ tmp.push_back(tmpData);
+ shootUntypedReadMsg(sel, insn, tmp, tmpAddr, 1, addrSpace);
if (isUniform)
sel.curr.noMask = 1;
- sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, sel.getBTITemps(bti));
if (isUniform)
sel.curr.execWidth = 1;
// Get the remaining offset from aligned addr
- sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
- sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
- sel.SHR(tmpData, tmpData, tmpAddr);
+ if (addrFamily == FAMILY_QWORD) {
+ sel.AND(addrOffset, sel.unpacked_ud(address.reg()), GenRegister::immud(0x3));
+ } else {
+ sel.AND(addrOffset, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+ }
+ sel.SHL(addrOffset, addrOffset, GenRegister::immud(0x3));
+ sel.SHR(tmpData, tmpData, addrOffset);
if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), GenRegister::unpacked_uw(tmpReg, isUniform, sel.isLongReg(tmpReg)));
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), GenRegister::unpacked_ub(tmpReg, isUniform));
sel.pop();
}
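
readByteAsDWord is the slow-gather fallback: load the dword that contains the byte or word, then shift the wanted part down. The same arithmetic as a scalar little-endian model (illustrative only):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Fetch the enclosing dword from the 4-byte-aligned address, then
    // shift the wanted byte down by 8 * (addr & 3).
    static uint8_t loadByteViaDword(const uint8_t *mem, uint32_t addr) {
      uint32_t aligned = addr & 0xfffffffcu;  // AND tmpAddr, addr, ~3
      uint32_t dword;
      std::memcpy(&dword, mem + aligned, 4);  // the one-dword UNTYPED_READ
      uint32_t shift = (addr & 0x3u) << 3;    // remaining offset, in bits
      return (uint8_t)(dword >> shift);       // SHR, then MOV ub
    }

    int main() {
      const uint8_t mem[8] = {0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88};
      std::printf("0x%02x\n", loadByteViaDword(mem, 6));  // prints 0x77
      return 0;
    }
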
@@ -3936,7 +4489,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
@@ -3955,7 +4508,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
}
- readDWord(sel, tmp, address, tmpRegNum, bti);
+ shootUntypedReadMsg(sel, insn, tmp, address, tmpRegNum, addrSpace);
for(uint32_t i = 0; i < tmpRegNum; i++) {
unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3976,6 +4529,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
GBE_ASSERT(effectData.size() == effectDataNum);
GBE_ASSERT(tmp.size() == effectDataNum + 1);
+ RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
sel.push();
Register alignedFlag = sel.reg(FAMILY_BOOL, isUniform);
GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
@@ -3984,7 +4538,12 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.AND(shiftL, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(0x3));
+ if (addrFamily == FAMILY_QWORD) {
+ GenRegister t = convertU64ToU32(sel, address);
+ sel.AND(shiftL, t, GenRegister::immud(0x3));
+ } else {
+ sel.AND(shiftL, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+ }
sel.SHL(shiftL, shiftL, GenRegister::immud(0x3));
sel.ADD(shiftH, GenRegister::negate(shiftL), GenRegister::immud(32));
sel.curr.physicalFlag = 0;
@@ -4012,11 +4571,93 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.pop();
}
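
The shiftL/shiftH pair above rebuilds an unaligned dword from the two aligned dwords it straddles. A scalar sketch of the recombination, assuming the little-endian lane layout used on Gen:

    #include <cstdint>
    #include <cstdio>

    // shiftL drops the unwanted low bytes of the first dword; shiftH
    // pulls the missing high bytes in from the second dword.
    static uint32_t spliceUnaligned(uint32_t lo, uint32_t hi, uint32_t addr) {
      uint32_t shiftL = (addr & 0x3u) << 3;
      if (shiftL == 0) return lo;             // aligned case: hi unused
      uint32_t shiftH = 32 - shiftL;
      return (lo >> shiftL) | (hi << shiftH);
    }

    int main() {
      // bytes 00..07 in memory; read the dword that starts at offset 3
      std::printf("0x%08x\n", spliceUnaligned(0x03020100u, 0x07060504u, 3));
      return 0;  // prints 0x06050403
    }
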
+ /* Used to transform an address from 64-bit to 32-bit. Note that dataport
+ * messages cannot accept a scalar register, so convert to a non-uniform
+ * register here. */
+ GenRegister convertU64ToU32(Selection::Opaque &sel,
+ GenRegister addr) const {
+ GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
+ GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.MOV(dst, unpacked);
+ return dst;
+ }
+
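
convertU64ToU32 keeps only the low dword of each 64-bit lane, which is safe while addresses are buffer offsets rather than full graphics addresses. The unpacked_ud access it relies on is a stride-2 dword view of the qword register; a lane-level model (little-endian assumed):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint64_t addrQ[8];
      for (int i = 0; i < 8; ++i)
        addrQ[i] = 0x123400000000ull + 0x40ull * i;
      uint32_t addrDW[8];
      // Read every other dword of the packed qwords, i.e. the low
      // 32 bits of each lane.
      for (int i = 0; i < 8; ++i)
        std::memcpy(&addrDW[i], (const uint8_t *)addrQ + 8 * i, 4);
      std::printf("addrDW[1]=0x%x\n", addrDW[1]);  // 0x40: high bits dropped
      return 0;
    }
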
+ void byteGatherStateless(Selection::Opaque &sel,
+ GenRegister addr,
+ GenRegister dst,
+ unsigned elemSize) const {
+ using namespace ir;
+ GenRegister addrQ;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned addrBytes = typeSize(addr.type);
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else {
+ addrQ = addr;
+ }
+
+ sel.push();
+ if (simdWidth == 8) {
+ sel.BYTE_GATHERA64(dst, addrQ, elemSize);
+ } else if (simdWidth == 16) {
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.BYTE_GATHERA64(GenRegister::Qn(dst, 0), GenRegister::Qn(addrQ, 0), elemSize);
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.BYTE_GATHERA64(GenRegister::Qn(dst, 1), GenRegister::Qn(addrQ, 1), elemSize);
+ }
+ sel.pop();
+ }
+ void shootByteGatherMsg(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister dst,
+ GenRegister addr,
+ unsigned elemSize,
+ bool isUniform,
+ ir::AddressSpace addrSpace) const {
+ using namespace ir;
+ unsigned addrBytes = typeSize(addr.type);
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.BYTE_GATHER(dst, addr, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
+ unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+ GenRegister addrDW = addr;
+ if (addrBytes == 8) {
+ addrDW = convertU64ToU32(sel, addr);
+ }
+
+ sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(bti), btiTemp);
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, addr);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = addr;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, addr);
+ sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(0xfe), btiTemp);
+
+ sel.curr.inversePredicate = 1;
+ byteGatherStateless(sel, addr, dst, elemSize);
+ sel.pop();
+ } else {
+ byteGatherStateless(sel, addr, dst, elemSize);
+ }
+ }
+
void emitUnalignedByteGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
@@ -4024,6 +4665,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
1 : sel.ctx.getSimdWidth();
const bool isUniform = simdWidth == 1;
RegisterFamily family = getFamily(insn.getValueType());
+ RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
+ Type addrType = getType(addrFamily);
if(valueNum > 1) {
GBE_ASSERT(!isUniform && "vector load should not be uniform. Something went wrong.");
@@ -4040,16 +4683,20 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
vector<GenRegister> tmp(effectDataNum + 1);
- vector<GenRegister> tmp2(effectDataNum + 1);
vector<GenRegister> effectData(effectDataNum);
for(uint32_t i = 0; i < effectDataNum + 1; i++)
- tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
- GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ GenRegister alignedAddr = sel.selReg(sel.reg(addrFamily, isUniform), addrType);
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+ if (addrFamily == FAMILY_DWORD)
+ sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+ else {
+ sel.MOV(alignedAddr, GenRegister::immuint64(~0x3ul));
+ sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UL), alignedAddr);
+ }
sel.pop();
uint32_t remainedReg = effectDataNum + 1;
@@ -4057,15 +4704,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
do {
uint32_t width = remainedReg > 4 ? 4 : remainedReg;
vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
- vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
if (pos != 0) {
sel.push();
- if (isUniform)
- sel.curr.noMask = 1;
+ if (isUniform)
+ sel.curr.noMask = 1;
+ if (addrFamily == FAMILY_DWORD)
sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
+ else
+ sel.ADD(alignedAddr, alignedAddr, GenRegister::immuint64(pos * 4));
sel.pop();
}
- readDWord(sel, t1, alignedAddr, width, bti);
+ shootUntypedReadMsg(sel, insn, t1, alignedAddr, width, addrSpace);
remainedReg -= width;
pos += width;
} while(remainedReg);
@@ -4082,7 +4731,6 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
}
} else {
GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_BYTE);
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
vector<GenRegister> dst(valueNum);
for(uint32_t i = 0; i < valueNum; i++)
dst[i] = sel.selReg(insn.getValue(i), getType(family));
@@ -4106,7 +4754,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
dataSize = GEN_BYTE_SCATTER_WORD;
else
dataSize = GEN_BYTE_SCATTER_DWORD;
- sel.BYTE_GATHER(readDst, addressForLoop, dataSize, b, sel.getBTITemps(bti));
+ shootByteGatherMsg(sel, insn, readDst, addressForLoop, dataSize, isUniform, addrSpace);
// only 4 bytes is gathered even if valueLeft >= 4
sel.UNPACK_BYTE(dst.data(), readDst, getFamilySize(FAMILY_BYTE), (valueLeft < 4 ? valueLeft : 4));
@@ -4122,23 +4770,22 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
GBE_ASSERT(insn.getValueNum() == 1);
const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
- if(sel.getSlowByteGather())
- readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+ if (sel.getSlowByteGather())
+ readByteAsDWord(sel, insn, elemSize, address, value, isUniform, addrSpace);
else {
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
-
// We need a temporary register if we read bytes or words
- Register dst = sel.reg(FAMILY_DWORD, isUniform);
+ Register dst = sel.reg(FAMILY_DWORD);
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, sel.getBTITemps(bti));
+ shootByteGatherMsg(sel, insn, sel.selReg(dst, ir::TYPE_U32), address, elemSize, isUniform, addrSpace);
sel.pop();
sel.push();
if (isUniform) {
sel.curr.noMask = 1;
sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
}
if (elemSize == GEN_BYTE_SCATTER_WORD)
sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst, isUniform));
@@ -4152,22 +4799,87 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
void emitOWordRead(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister address,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
+ uint32_t SI = insn.getSurfaceIndex();
const uint32_t vec_size = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ const Type type = insn.getValueType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ bool isA64 = SI == 255;
+
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
vector<GenRegister> valuesVec;
- for(uint32_t i = 0; i < vec_size; i++)
- valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
- // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size / 8;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ valuesVec.push_back(sel.selReg(insn.getValue(i), type));
+
+ GenRegister headeraddr;
+ if (isA64)
+ headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 2 * 4), GEN_TYPE_UD);
+ // Make header
+ sel.push();
+ {
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ sel.curr.execWidth = 1;
+
+ // Put zero in the general state base address
+ if (isA64)
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+ else {
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+ sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ }
+ sel.pop();
+
+ /* For block reads we need to unpack the block data into values; for
+ * different simd widths and vector sizes with different type sizes we may
+ * need to split the block read send message.
+ * A send message can be at most 5 registers long (1 header + 4 data regs),
+ * so each combination has its own message length and tmp vector size:
+ * | | w, simd8 | w, simd16 | dw, simd8 | dw, simd16
+ * r0 | header | | | |
+ * r1 | data | w0,w1 | w0 | dw0 | dw0
+ * r2 | data | w2,w3 | w1 | dw1 | dw0
+ * r3 | data | ...... | ...... | ...... | dw1
+ * r4 | data | ...... | ...... | ...... | dw1
+ */
+
+ uint32_t totalSize = simdWidth * typeSize * vec_size;
+ uint32_t valueSize = simdWidth * typeSize;
+ uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+ uint32_t msg_num = vec_size / tmp_size;
+ uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
- sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size);
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ for (uint32_t i = 0; i < msg_num; i++) {
+ if (i > 0) {
+ sel.push();
+ {
+ // Update the address in header
+ sel.curr.execWidth = 1;
+ sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+ }
+ sel.pop();
+ }
+ sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+ for (uint32_t j = 0; j < tmp_size; j++)
+ sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+ }
+
}
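
The tmp_size / msg_num / ow_size arithmetic above decides how many block-read messages to issue, under the constraint that one message carries at most four data GRFs (128 bytes) and an OWord is 16 bytes. A standalone check of the same formulas:

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (uint32_t simdWidth : {8u, 16u})
        for (uint32_t typeSize : {2u, 4u})
          for (uint32_t vec_size : {1u, 2u, 4u, 8u}) {
            uint32_t totalSize = simdWidth * typeSize * vec_size;
            uint32_t valueSize = simdWidth * typeSize;
            uint32_t tmp_size  = totalSize > 128 ? 128 / valueSize : vec_size;
            uint32_t msg_num   = vec_size / tmp_size;
            uint32_t ow_size   = msg_num > 1 ? 8 : totalSize / 16;
            std::printf("simd%-2u ts%u vec%u -> %u msg(s) of %u OWord(s)\n",
                        simdWidth, typeSize, vec_size, msg_num, ow_size);
          }
      return 0;
    }
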
// check whether all binding table indices point to constant memory
@@ -4182,59 +4894,51 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
{
using namespace ir;
const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
- GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
+ Register reg = insn.getAddressRegister();
+ GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
insn.getAddressSpace() == MEM_CONSTANT ||
insn.getAddressSpace() == MEM_PRIVATE ||
insn.getAddressSpace() == MEM_LOCAL ||
+ insn.getAddressSpace() == MEM_GENERIC ||
insn.getAddressSpace() == MEM_MIXED);
//GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
- BTI bti;
- AddressMode am = insn.getAddressMode();
- if (am == AM_StaticBti) {
- bti.isConst = 1;
- bti.imm = insn.getSurfaceIndex();
- } else if (am == AM_DynamicBti) {
- bti.isConst = 0;
- bti.reg = insn.getBtiReg();
- } else {
- assert(0 && "stateless not supported yet");
- }
+ AddressSpace addrSpace = insn.getAddressSpace();
+
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
- bool allConstant = isAllConstant(bti);
if (insn.isBlock())
- this->emitOWordRead(sel, insn, address, bti);
- else if (allConstant) {
+ this->emitOWordRead(sel, insn, address, addrSpace);
+ else if (isReadConstantLegacy(insn)) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, for byte/short data types, we have to read through the data cache.
if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitRead64(sel, insn, address, bti);
+ this->emitRead64(sel, insn, address, addrSpace);
else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitDWordGather(sel, insn, address, bti);
+ this->emitDWordGather(sel, insn, address, addrSpace);
else if (insn.isAligned() == true)
- this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
else
- this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
} else {
if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitRead64(sel, insn, address, bti);
+ this->emitRead64(sel, insn, address, addrSpace);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedRead(sel, insn, address, bti);
+ this->emitUntypedRead(sel, insn, address, addrSpace);
else if (insn.isAligned())
- this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
else
- this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
}
markAllChildren(dag);
+
return true;
}
};
-
class StoreInstructionPattern : public SelectionPattern
{
public:
@@ -4242,44 +4946,316 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
StoreInstructionPattern(void) : SelectionPattern(1, 1) {
this->opcodes.push_back(ir::OP_STORE);
}
- void emitUntypedWrite(Selection::Opaque &sel,
- const ir::StoreInstruction &insn,
+ GenRegister convertU64ToU32(Selection::Opaque &sel,
+ GenRegister addr) const {
+ GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
+ GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.MOV(dst, unpacked);
+ return dst;
+ }
+
+ void untypedWriteStateless(Selection::Opaque &sel,
+ GenRegister address,
+ vector<GenRegister> &value) const
+ {
+ using namespace ir;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned int addrBytes = typeSize(address.type);
+ unsigned valueNum = value.size();
+ GenRegister addrQ;
+ if (addrBytes == 4) {
+ if (simdWidth == 8) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, address);
+ } else if (simdWidth == 16) {
+ addrQ = address;
+ }
+ } else if (addrBytes == 8) {
+ addrQ = address;
+ }
+
+ if (simdWidth == 8) {
+ vector<GenRegister> msg;
+ msg.push_back(addrQ);
+ for (unsigned k = 0; k < valueNum; k++)
+ msg.push_back(value[k]);
+
+ sel.UNTYPED_WRITEA64(msg.data(), valueNum+1, valueNum);
+ } else if (simdWidth == 16) {
+ vector<GenRegister> msgs;
+ for (unsigned k = 0; k < (valueNum+1)/2+1; k++) {
+ msgs.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32));
+ }
+ sel.push();
+ /* do first quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));
+ for (unsigned k = 0; k < valueNum; k++) {
+ sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 0));
+ }
+ sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum);
+
+ /* do second quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));
+ for (unsigned k = 0; k < valueNum; k++)
+ sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 1));
+ sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum);
+ sel.pop();
+ }
+ }
+
+ void shootUntypedWriteMsg(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister &address,
+ vector<GenRegister> &value,
+ ir::AddressSpace addrSpace) const
+ {
+ using namespace ir;
+ unsigned int addrBytes = typeSize(address.type);
+ unsigned valueNum = value.size();
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.UNTYPED_WRITE(address, value.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.UNTYPED_WRITE(address, value.data(), valueNum, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL) {
+ GenRegister addr = address;
+ if (addrBytes == 8) {
+ addr = convertU64ToU32(sel, address);
+ }
+ sel.UNTYPED_WRITE(addr, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, address);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = address;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, address);
+ sel.UNTYPED_WRITE(addrDW, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+
+ sel.curr.inversePredicate = 1;
+ untypedWriteStateless(sel, address, value);
+ sel.pop();
+ } else {
+ untypedWriteStateless(sel, address, value);
+ }
+ }
+
+ void emitUntypedWrite(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister address,
+ ir::AddressSpace addrSpace) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ vector<GenRegister> value(valueNum);
+
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+ value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
+
+ shootUntypedWriteMsg(sel, insn, address, value, addrSpace);
+ }
+
+ void write64Legacy(Selection::Opaque &sel,
+ GenRegister address,
+ vector<GenRegister> &value,
+ GenRegister bti,
+ vector<GenRegister> &btiTemp) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = value.size();
+ if (sel.hasLongType()) {
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+ sel.WRITE64(address, value.data(), tmp.data(), valueNum, bti, true, btiTemp);
+ } else {
+ sel.WRITE64(address, value.data(), NULL, valueNum, bti, false, btiTemp);
+ }
+ }
+
+ void write64Stateless(Selection::Opaque &sel,
GenRegister address,
- ir::BTI &bti) const
+ vector<GenRegister> &value) const
{
using namespace ir;
- const uint32_t valueNum = insn.getValueNum();
- vector<GenRegister> value(valueNum), tmps;
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned int addrBytes = typeSize(address.type);
+ unsigned valueNum = value.size();
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+ GenRegister addrQ;
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, address);
+ } else {
+ addrQ = address;
+ }
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
- value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
- sel.UNTYPED_WRITE(address, value.data(), valueNum, b, sel.getBTITemps(bti));
- }
+ sel.push();
+ if (simdWidth == 8) {
+ sel.WRITE64A64(addrQ, value.data(), tmp.data(), valueNum);
+ } else {
+ GenRegister tmpAddr, tmpSrc;
+ tmpAddr = GenRegister::Qn(addrQ, 0);
+ tmpSrc = GenRegister::Qn(value[0], 0);
+ GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+
+ /* SIMD16 long register is just enough for (SIMD8 A64 addr + SIMD8 long) */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr);
+ sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc);
+ sel.UNTYPED_WRITEA64(&tmp, 1, 2);
+ tmpAddr = GenRegister::Qn(addrQ, 1);
+ tmpSrc = GenRegister::Qn(value[0], 1);
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr);
+ sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc);
+ sel.UNTYPED_WRITEA64(&tmp, 1, 2);
+ }
+ sel.pop();
+ }
void emitWrite64(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
GenRegister address,
- ir::BTI &bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
vector<GenRegister> src(valueNum);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
- if (sel.hasLongType()) {
- vector<GenRegister> tmp(valueNum);
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
- tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ AddressMode AM = insn.getAddressMode();
+ unsigned int addrBytes = typeSize(address.type);
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ if (AM != AM_Stateless) {
+ GenRegister b;
+ if (AM == AM_DynamicBti) {
+ b = sel.selReg(insn.getBtiReg(), TYPE_U32);
+ } else {
+ b = GenRegister::immud(insn.getSurfaceIndex());
+ }
+ write64Legacy(sel, address, src, b, btiTemp);
+ } else if (addrSpace == MEM_LOCAL) {
+ GenRegister b = GenRegister::immud(0xfe);
+ GenRegister addr = address;
+ if (addrBytes == 8) {
+ addr = convertU64ToU32(sel, address);
+ }
+ write64Legacy(sel, addr, src, b, btiTemp);
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, address);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = address;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, address);
+ write64Legacy(sel, addrDW, src, GenRegister::immud(0xfe), btiTemp);
+
+ sel.curr.inversePredicate = 1;
+ write64Stateless(sel, address, src);
+ sel.pop();
+ } else {
+ GBE_ASSERT(sel.hasLongType());
+ write64Stateless(sel, address, src);
+ }
+ }
+
+ void byteScatterStateless(Selection::Opaque &sel,
+ GenRegister address,
+ GenRegister data,
+ unsigned elemSize) const {
+ using namespace ir;
+ unsigned addrBytes = typeSize(address.type);
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ GenRegister addrQ;
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, address);
+ } else {
+ addrQ = address;
+ }
+ if (simdWidth == 8) {
+ GenRegister msg[2];
+ msg[0] = addrQ;
+ msg[1] = data;
+ sel.BYTE_SCATTERA64(msg, 2, elemSize);
+ } else if (simdWidth == 16) {
+ GenRegister msgs[2];
+ msgs[0] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ msgs[1] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.push();
+ sel.curr.execWidth = 8;
+ /* do first quarter */
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 0));
+ sel.BYTE_SCATTERA64(msgs, 2, elemSize);
+ /* do second quarter */
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 1));
+ sel.BYTE_SCATTERA64(msgs, 2, elemSize);
+ sel.pop();
+ }
+ }
+ void shootByteScatterMsg(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister address,
+ GenRegister data,
+ unsigned elemSize,
+ ir::AddressSpace addrSpace) const
+ {
+ using namespace ir;
+ unsigned addrBytes = typeSize(address.type);
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ if (AM != AM_Stateless) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.BYTE_SCATTER(address, data, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.BYTE_SCATTER(address, data, elemSize, GenRegister::immud(SI), btiTemp);
}
- sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
+ } else if (addrSpace == ir::MEM_LOCAL) {
+ GenRegister addr = address;
+ if (addrBytes == 8) {
+ addr = convertU64ToU32(sel, address);
+ }
+ sel.BYTE_SCATTER(addr, data, elemSize, GenRegister::immud(0xfe), btiTemp);
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, address);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = address;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, address);
+ sel.BYTE_SCATTER(addrDW, data, elemSize, GenRegister::immud(0xfe), btiTemp);
+
+ sel.curr.inversePredicate = 1;
+ byteScatterStateless(sel, address, data, elemSize);
+ sel.pop();
} else {
- sel.WRITE64(address, src.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
+ byteScatterStateless(sel, address, data, elemSize);
}
}
@@ -4287,13 +5263,12 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const ir::StoreInstruction &insn,
const uint32_t elemSize,
GenRegister address,
- ir::BTI &bti,
+ ir::AddressSpace addrSpace,
bool isUniform) const
{
using namespace ir;
uint32_t valueNum = insn.getValueNum();
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
if(valueNum > 1) {
const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
vector<GenRegister> value(valueNum);
@@ -4313,77 +5288,136 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
}
- sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, sel.getBTITemps(bti));
+ shootUntypedWriteMsg(sel, insn, address, tmp, addrSpace);
} else {
const GenRegister value = sel.selReg(insn.getValue(0));
GBE_ASSERT(insn.getValueNum() == 1);
- const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
- sel.push();
- if (isUniform) {
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- }
+ if (elemSize == GEN_BYTE_SCATTER_WORD)
+ sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+ else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+ sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
- if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
- else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
- sel.pop();
- sel.BYTE_SCATTER(address, tmp, elemSize, b, sel.getBTITemps(bti));
+ shootByteScatterMsg(sel, insn, address, tmp, elemSize, addrSpace);
}
}
void emitOWordWrite(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
GenRegister address,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
+ uint32_t SI = insn.getSurfaceIndex();
const uint32_t vec_size = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ const Type type = insn.getValueType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ bool isA64 = SI == 255;
+ uint32_t offset_size = isA64 ? 128 : 8;
+
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
vector<GenRegister> valuesVec;
- for(uint32_t i = 0; i < vec_size; i++)
- valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
- // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size / 8;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ valuesVec.push_back(sel.selReg(insn.getValue(i), type));
+
+ GenRegister headeraddr;
+ if (isA64)
+ headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 2 * 4), GEN_TYPE_UD);
+ // Make header
+ sel.push();
+ {
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ sel.curr.execWidth = 1;
+
+ // Put zero in the general state base address
+ if (isA64)
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+ else {
+ sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+ sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ }
+ sel.pop();
+
+ /* For block writes we need to pack the block data into the tmp registers;
+ * for different simd widths and vector sizes with different type sizes we
+ * may need to split the block write send message.
+ * A send message can be at most 5 registers long (1 header + 4 data regs),
+ * so each combination has its own message length and tmp vector size:
+ * | | w, simd8 | w, simd16 | dw, simd8 | dw, simd16
+ * r0 | header | | | |
+ * r1 | data | w0,w1 | w0 | dw0 | dw0
+ * r2 | data | w2,w3 | w1 | dw1 | dw0
+ * r3 | data | ...... | ...... | ...... | dw1
+ * r4 | data | ...... | ...... | ...... | dw1
+ */
+
+ uint32_t totalSize = simdWidth * typeSize * vec_size;
+ uint32_t valueSize = simdWidth * typeSize;
+ uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+ uint32_t msg_num = vec_size / tmp_size;
+ uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
- sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size);
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ for (uint32_t i = 0; i < msg_num; i++) {
+ for (uint32_t j = 0; j < tmp_size; j++)
+ sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+ if (i > 0) {
+ sel.push();
+ {
+ // Update the address in header
+ sel.curr.execWidth = 1;
+ sel.ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
+ }
+ sel.pop();
+ }
+ sel.push();
+ // In simd8 mode, when the data spans more than one register, execWidth 8
+ // gives a wrong result, so set the execWidth to 16.
+ sel.curr.execWidth = 16;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+ sel.pop();
+ }
+
+
}
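
Note the addressing-unit difference between the two write paths above: the BTI header takes an OWord offset (hence the SHR by 4 and offset_size of 8 per follow-up message), while the A64 header takes a raw byte address (offset_size of 128). A tiny model of that header bookkeeping, with illustrative values:

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (int isA64 = 0; isA64 <= 1; ++isA64) {
        uint64_t byteAddr = 0x1000;
        // SHR headeraddr, addr, 4 in BTI mode; raw 64-bit address for A64.
        uint64_t hdr = isA64 ? byteAddr : (byteAddr >> 4);
        uint32_t offset_size = isA64 ? 128 : 8;  // bytes vs. OWords per msg
        for (uint32_t msg = 0; msg < 3; ++msg) {
          std::printf("A64=%d msg %u: header address field=0x%llx\n",
                      isA64, msg, (unsigned long long)hdr);
          hdr += offset_size;                    // ADD headeraddr, offset_size
        }
      }
      return 0;
    }
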
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
- GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
+ Register reg = insn.getAddressRegister();
+ GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
+ AddressSpace addrSpace = insn.getAddressSpace();
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) && sel.isScalarReg(insn.getValue(0));
- BTI bti;
- AddressMode am = insn.getAddressMode();
- if (am == AM_StaticBti) {
- bti.isConst = 1;
- bti.imm = insn.getSurfaceIndex();
- } else if (am == AM_DynamicBti) {
- bti.isConst = 0;
- bti.reg = insn.getBtiReg();
- } else {
- assert(0 && "stateless not supported yet");
- }
if (insn.isBlock())
- this->emitOWordWrite(sel, insn, address, bti);
+ this->emitOWordWrite(sel, insn, address, addrSpace);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitWrite64(sel, insn, address, bti);
+ this->emitWrite64(sel, insn, address, addrSpace);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedWrite(sel, insn, address, bti);
+ this->emitUntypedWrite(sel, insn, address, addrSpace);
else {
- this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
+ this->emitByteScatter(sel, insn, elemSize, address, addrSpace, isUniform);
}
markAllChildren(dag);
@@ -4416,9 +5450,13 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
if (liveOut.contains(dst) || dag.computeBool)
needStoreBool = true;
+ // Why do we set tmpDst to null?
+ // Because for the listed types the compare instruction cannot
+ // generate a bool (UW) result directly into the GRF; we need an extra
+ // select to produce the bool value in the GRF.
if(type == TYPE_S64 || type == TYPE_U64 ||
type == TYPE_DOUBLE || type == TYPE_FLOAT ||
- type == TYPE_U32 || type == TYPE_S32 /*||
+ type == TYPE_U32 || type == TYPE_S32 || type == TYPE_HALF /*||
(!needStoreBool)*/)
tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_F);
else
@@ -4451,7 +5489,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
} else {
if((type == TYPE_S64 || type == TYPE_U64 ||
type == TYPE_DOUBLE || type == TYPE_FLOAT ||
- type == TYPE_U32 || type == TYPE_S32))
+ type == TYPE_U32 || type == TYPE_S32 || type == TYPE_HALF))
sel.curr.flagGen = 1;
else if (sel.isScalarReg(dst)) {
// If the dest reg is a scalar bool, we can't set it as
@@ -5456,34 +6494,154 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
this->opcodes.push_back(ir::Opcode(op));
}
+ /* Transform an address from 64-bit to 32-bit. Note that dataport messages
+ * cannot accept a scalar register, so we also convert the address to a
+ * non-uniform register here. */
+ GenRegister convertU64ToU32(Selection::Opaque &sel,
+ GenRegister addr) const {
+ GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
+ GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.MOV(dst, unpacked);
+ return dst;
+ }
+
+ void untypedAtomicA64Stateless(Selection::Opaque &sel,
+ const ir::AtomicInstruction &insn,
+ unsigned msgPayload,
+ GenRegister dst,
+ GenRegister addr,
+ GenRegister src1,
+ GenRegister src2,
+ GenRegister bti) const {
+ using namespace ir;
+ GenRegister addrQ;
+ const AtomicOps atomicOp = insn.getAtomicOpcode();
+ GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+ unsigned addrBytes = typeSize(addr.type);
+ GBE_ASSERT(msgPayload <= 3);
+
+ unsigned simdWidth = sel.curr.execWidth;
+ AddressMode AM = insn.getAddressMode();
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else {
+ addrQ = addr;
+ }
+
+ if (simdWidth == 8) {
+ vector<GenRegister> msgs;
+ msgs.push_back(addr);
+ msgs.push_back(src1);
+ msgs.push_back(src2);
+ sel.ATOMICA64(dst, genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM));
+ } else if (simdWidth == 16) {
+ vector<GenRegister> msgs;
+ RegisterFamily family = sel.getRegisterFamily(insn.getDst(0));
+ Type type = getType(family);
+ for (unsigned k = 0; k < msgPayload; k++) {
+ msgs.push_back(sel.selReg(sel.reg(family), type));
+ }
+ sel.push();
+ /* first quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));
+ if(msgPayload > 1) {
+ if(family == ir::FAMILY_QWORD)
+ sel.MOV(GenRegister::Qn(msgs[0], 1), GenRegister::Qn(src1, 0));
+ else
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 0));
+ }
+ if(msgPayload > 2) {
+ if(family == ir::FAMILY_QWORD)
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src2, 0));
+ else
+ sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 0));
+ }
+ sel.ATOMICA64(GenRegister::Qn(dst, 0), genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM));
+
+ /* second quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));
+ if(msgPayload > 1) {
+ if(family == ir::FAMILY_QWORD)
+ sel.MOV(GenRegister::Qn(msgs[0], 1), GenRegister::Qn(src1, 1));
+ else
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 1));
+ }
+ if(msgPayload > 2) {
+ if(family == ir::FAMILY_QWORD)
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src2, 1));
+ else
+ sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 1));
+ }
+ sel.ATOMICA64(GenRegister::Qn(dst, 1), genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM));
+ sel.pop();
+ }
+ }
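+
A hedged standalone model of the SIMD16 split performed by untypedAtomicA64Stateless above: A64 untyped atomics are assumed to execute at SIMD8, so each quarter (Q1/Q2) of the SIMD16 payload is copied out and sent as its own message:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t addr[16];                                // per-lane 64-bit addresses
  for (int i = 0; i < 16; ++i) addr[i] = 0x1000 + 8u * i;
  for (int quarter = 0; quarter < 2; ++quarter) {   // Q1, then Q2
    const uint64_t *half = &addr[quarter * 8];      // contiguous SIMD8 slice
    printf("msg %d: lanes %2d..%2d, first addr 0x%llx\n", quarter,
           quarter * 8, quarter * 8 + 7, (unsigned long long)half[0]);
  }
  return 0;
}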
+
INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
using namespace ir;
const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
- ir::BTI b;
const AtomicOps atomicOp = insn.getAtomicOpcode();
unsigned srcNum = insn.getSrcNum();
unsigned msgPayload;
+ Register reg = insn.getAddressRegister();
+ GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
+ AddressSpace addrSpace = insn.getAddressSpace();
+ GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+ insn.getAddressSpace() == MEM_PRIVATE ||
+ insn.getAddressSpace() == MEM_LOCAL ||
+ insn.getAddressSpace() == MEM_GENERIC ||
+ insn.getAddressSpace() == MEM_MIXED);
+ unsigned addrBytes = typeSize(address.type);
AddressMode AM = insn.getAddressMode();
if (AM == AM_DynamicBti) {
- b.reg = insn.getBtiReg();
msgPayload = srcNum - 1;
} else {
- b.imm = insn.getSurfaceIndex();
- b.isConst = 1;
msgPayload = srcNum;
}
- GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
- GenRegister bti = b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
- GenRegister src0 = sel.selReg(insn.getAddressRegister(), TYPE_U32);
+ Type type = getType(sel.getRegisterFamily(insn.getDst(0)));
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister src0 = sel.selReg(insn.getAddressRegister(), type);
GenRegister src1 = src0, src2 = src0;
- if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
- if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+ if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), type);
+ if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), type);
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
- sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti, sel.getBTITemps(b));
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2, sel.selReg(btiReg, type), sel.getBTITemps(AM));
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2, GenRegister::immud(SI), sel.getBTITemps(AM));
+ }
+ } else if (addrSpace == ir::MEM_LOCAL) {
+ // In stateless mode, local memory still uses BTI access.
+ GenRegister addrDW = address;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, address);
+ sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2, GenRegister::immud(0xfe), sel.getBTITemps(AM));
+ } else if (addrSpace == ir::MEM_GENERIC) {
+ Register localMask = generateLocalMask(sel, address);
+ sel.push();
+ sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+ GenRegister addrDW = address;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, address);
+ sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2, GenRegister::immud(0xfe), sel.getBTITemps(AM));
+
+ sel.curr.inversePredicate = 1;
+ untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1, src2, GenRegister::immud(0xff));
+ sel.pop();
+ } else
+ untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1, src2, GenRegister::immud(0xff));
markAllChildren(dag);
return true;
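
As a rough standalone model of the MEM_GENERIC dispatch above (the local-mask predicate and its bound here are illustrative stand-ins for generateLocalMask()): pointers the mask marks as local go to the SLM surface 0xfe with a 32-bit offset, everything else takes the stateless A64 path with 0xff:

#include <cstdint>
#include <cstdio>

// Stand-in for the per-lane mask from generateLocalMask(); bound is invented.
static bool isLocal(uint64_t p) { return p < 0x10000; }

int main() {
  const uint64_t ptrs[] = { 0x1040, 0x7f0000001000ULL };
  for (uint64_t p : ptrs) {
    if (isLocal(p))   // SLM path: BTI 0xfe, low 32 bits as the dword address
      printf("0x%llx -> BTI 0xfe, offset 0x%x\n",
             (unsigned long long)p, (uint32_t)p);
    else              // stateless path: A64 message, BTI 0xff
      printf("0x%llx -> stateless A64 (BTI 0xff)\n", (unsigned long long)p);
  }
  return 0;
}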
@@ -5639,6 +6797,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.push();
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
sel.cmpBlockIP(GEN_CONDITIONAL_LE, src0, src1);
sel.pop();
@@ -5649,6 +6809,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
// this block, as it will always execute with all lanes activated.
sel.push();
sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
sel.setBlockIP(src0, sel.ctx.getMaxLabel());
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
@@ -5667,6 +6829,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
// FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
sel.push();
sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
sel.setBlockIP(src0, label.value());
sel.pop();
}
@@ -5678,6 +6842,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
(jip != nextLabel || sel.block->endifOffset != -1)) {
// If it is required, insert a JUMP to bypass the block
sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else if (simdWidth == 16)
@@ -5692,6 +6858,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
}
if(!sel.block->removeSimpleIfEndif){
sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
sel.curr.predicate = GEN_PREDICATE_NORMAL;
if(!insn.getParent()->needEndif && insn.getParent()->needIf) {
ir::LabelIndex label = insn.getParent()->endifLabel;
@@ -5831,86 +6999,97 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
{
INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren) const
{
- using namespace ir;
- const uint32_t simdWidth = sel.ctx.getSimdWidth();
- GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
- const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
- const uint32_t dim = insn.getSrcNum() - 4;
-
- if (simdWidth == 16) {
- for(uint32_t i = 0; i < msgNum; i++)
- msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- } else {
- uint32_t valueID = 0;
- uint32_t msgID = 0;
- msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- for(; msgID < 1 + dim; msgID++, valueID++)
- msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
-
- // fake v.
- if (dim < 2)
- msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- // fake w.
- if (dim < 3)
- msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- // LOD.
- msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- for(; valueID < insn.getSrcNum(); msgID++, valueID++)
- msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
- }
-
+ const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
sel.push();
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
- sel.MOV(msgs[0], GenRegister::immud(0));
+ sel.MOV(header, GenRegister::immud(0));
sel.curr.execWidth = 1;
-
- GenRegister channelEn = sel.getOffsetReg(msgs[0], 0, 7*4);
+ GenRegister channelEn = sel.getOffsetReg(header, 0, 7*4);
// Enable all channels.
sel.MOV(channelEn, GenRegister::immud(0xffff));
- sel.curr.execWidth = 8;
- // Set zero LOD.
- if (simdWidth == 8)
- sel.MOV(msgs[4], GenRegister::immud(0));
- else
- sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
sel.pop();
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ if (simdWidth == 16)
+ emitWithSimd16(sel, insn, markChildren, header);
+ else if (simdWidth == 8)
+ emitWithSimd8(sel, insn, markChildren, header);
+ else
+ NOT_SUPPORTED;
+ return true;
+ }
+
+ INLINE bool emitWithSimd16(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren, const GenRegister& header) const
+ {
+ using namespace ir;
+
+ GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+ msgs[0] = header;
+ for (uint32_t i = 1; i < 9; ++i) {
+ // SIMD16 will be split into two SIMD8 messages; each virtual reg in msgs
+ // requires one physical reg of 8 DWORDs (32 bytes). So declare it with
+ // FAMILY_WORD: the allocated size is sizeof(WORD) * SIMD16 = 32 bytes.
+ msgs[i] = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U32);
+ }
+
+ const uint32_t dims = insn.getSrcNum() - 4;
uint32_t bti = insn.getImageIndex();
- if (simdWidth == 8)
- sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
- else {
- sel.push();
- sel.curr.execWidth = 8;
- for( uint32_t quarter = 0; quarter < 2; quarter++)
- {
- #define QUARTER_MOV0(msgs, msgid, src) \
- sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
- GenRegister::Qn(src, quarter))
-
- #define QUARTER_MOV1(msgs, msgid, src) \
- sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
- GenRegister::Qn(src, quarter))
- sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
- // Set U,V,W
- QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
- if (dim > 1)
- QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
- if (dim > 2)
- QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
- // Set R, G, B, A
- QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(dim), insn.getSrcType()));
- QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(dim + 1), insn.getSrcType()));
- QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(dim + 2), insn.getSrcType()));
- QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(dim + 3), insn.getSrcType()));
- sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
- #undef QUARTER_MOV0
- #undef QUARTER_MOV1
+
+ sel.push();
+ sel.curr.execWidth = 8;
+ for (uint32_t i = 0; i < 2; ++i) { //SIMD16 split to two SIMD8
+ sel.curr.quarterControl = (i == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+ uint32_t msgid = 1;
+ for (uint32_t dim = 0; dim < dims; ++dim) { //the coords
+ GenRegister coord = sel.selReg(insn.getSrc(dim), insn.getCoordType());
+ sel.MOV(GenRegister::retype(msgs[msgid++], coord.type), GenRegister::Qn(coord, i));
}
- sel.pop();
+
+ while (msgid < 5) //fill fake coords
+ sel.MOV(msgs[msgid++], GenRegister::immud(0));
+
+ for (uint32_t j = 0; j < 4; ++j) { //the data
+ GenRegister data = sel.selReg(insn.getSrc(j + dims), insn.getSrcType());
+ sel.MOV(GenRegister::retype(msgs[msgid++], data.type), GenRegister::Qn(data, i));
+ }
+
+ sel.TYPED_WRITE(msgs, 9, bti, dims == 3);
+ }
+ sel.pop();
+ return true;
+ }
+
+ INLINE bool emitWithSimd8(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren, const GenRegister& header) const
+ {
+ using namespace ir;
+ GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+ msgs[0] = header;
+
+ const uint32_t dims = insn.getSrcNum() - 4;
+ uint32_t bti = insn.getImageIndex();
+ uint32_t msgid = 1;
+
+ for (uint32_t dim = 0; dim < dims; ++dim) { //the coords
+ GenRegister coord = sel.selReg(insn.getSrc(dim), insn.getCoordType());
+ msgs[msgid++] = coord;
+ }
+
+ while (msgid < 5) { //fill fake coords
+ GenRegister fake = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ sel.MOV(fake, GenRegister::immud(0));
+ msgs[msgid++] = fake;
}
+
+ for (uint32_t j = 0; j < 4; ++j) { //the data
+ GenRegister data = sel.selReg(insn.getSrc(j + dims), insn.getSrcType());
+ msgs[msgid++] = data;
+ }
+
+ sel.TYPED_WRITE(msgs, 9, bti, dims == 3);
return true;
}
+
DECL_CTOR(TypedWriteInstruction, 1, 1);
};
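
For reference, a minimal sketch of the nine-register typed-write payload both emitters above build: header, then U/V/W/LOD, then the four color channels, with unused coordinate slots zero-filled:

#include <cstdio>

int main() {
  const char *slot[9] = {"header", "U", "V", "W", "LOD", "R", "G", "B", "A"};
  const int dims = 2;                        // e.g. a 2D image write
  for (int i = 0; i < 9; ++i) {
    // Slots 1..dims carry real coordinates; slots dims+1..4 are faked with 0.
    const char *note = (i > dims && i < 5) ? " (zero-filled)" : "";
    printf("msg[%d] = %s%s\n", i, slot[i], note);
  }
  return 0;
}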
@@ -6170,8 +7349,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
uint8_t BTI = insn.getBti();
GenRegister tmp0, tmp1;
uint32_t srcNum = insn.getSrcNum();
- GenRegister dst = sel.selReg(insn.getDst(0), TYPE_S32);
- //GBE_ASSERT(srcNum);
+
uint32_t i = 0;
uint32_t totalSize = 0;
bool isContinue = false;
@@ -6192,14 +7370,14 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
i = 0;
GenRegister regs[8];
if (srcNum == 0) {
- sel.PRINTF(dst, BTI, tmp0, tmp1, regs, srcNum, num, isContinue, totalSize);
+ sel.PRINTF(BTI, tmp0, tmp1, regs, srcNum, num, isContinue, totalSize);
} else {
do {
uint32_t s = srcNum < 8 ? srcNum : 8;
for (uint32_t j = 0; j < s; j++) {
regs[j] = sel.selReg(insn.getSrc(i + j), insn.getType(i + j));
}
- sel.PRINTF(dst, BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize);
+ sel.PRINTF(BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize);
if (srcNum > 8) {
srcNum -= 8;
@@ -6257,6 +7435,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
} else {
// Update the PcIPs
const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
if(insn.getParent()->needEndif)
sel.setBlockIP(ip, dst.value());
@@ -6323,6 +7503,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
} else {
const LabelIndex next = bb.getNextBlock()->getLabelIndex();
// Update the PcIPs
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
if(insn.getParent()->needEndif)
sel.setBlockIP(ip, dst.value());
sel.block->endifOffset = -1;
@@ -6440,10 +7622,15 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD));
/* Allocate registers for message sending
- * (read/write to shared local memory) */
+ * (read/write to shared local memory); only one datum (ud/ul) is needed
+ * for thread communication, and we always use SIMD8 for the read/write
+ */
vector<GenRegister> msg;
- for(uint32_t i = 0; i < 6; i++)
- msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
+ msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG))); //address
+ msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG))); //data
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG))); //data
/* Insert a barrier to make sure all the var we are interested in
have been assigned the final value. */
@@ -6455,7 +7642,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
/* Perform workgroup op */
sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1,
- localThreadID, localThreadNUM, tmpData2, slmOff, msg, 6,
+ localThreadID, localThreadNUM, tmpData2, slmOff, msg,
localBarrier);
return true;
@@ -6713,20 +7900,78 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
uint32_t vec_size = insn.getVectorSize();
uint32_t simdWidth = sel.curr.execWidth;
+ const Type type = insn.getType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ uint32_t response_size = simdWidth * vec_size * typeSize / 32;
+ // A ushort in SIMD8 fills only half a register, but the response length is still 1.
+ response_size = response_size ? response_size : 1;
+ uint32_t block_width = typeSize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
for (uint32_t i = 0; i < vec_size; ++i) {
- valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32));
- if(simdWidth == 16)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ valuesVec.push_back(sel.selReg(insn.getDst(i), type));
+ if(simdWidth == 16 && typeSize == 4)
+ tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
}
- const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
- const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
- GenRegister *tmp = NULL;
- if(simdWidth == 16)
- tmp = &tmpVec[0];
- sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+ const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+ const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+ const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+ const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+
+ // Make header
+ sel.push();
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.MOV(offsetx, coordx);
+ sel.MOV(offsety, coordy);
+ // Update block width and height
+ sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+ sel.pop();
+
+ if (simdWidth * typeSize < 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ // Now read the data
+ sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
+ sel.pop();
+ } else if (simdWidth * typeSize == 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(valuesVec[i], tmpVec[i]);
+
+ // Second half
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ // Now read the data
+ sel.curr.execWidth = 8;
+ sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+
+ // Move the regs to fit the vector register rule.
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+ sel.pop();
+ } else NOT_IMPLEMENTED;
+
+
return true;
}
DECL_CTOR(MediaBlockReadInstruction, 1, 1);
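
A standalone sketch of the block-size word computed above: the media block header packs (row width in bytes - 1) in the low bits and (rows - 1) in bits 16 and up; the % 32 keeps a 64-byte-wide SIMD16 dword access within the assumed 32-byte row limit so it can be issued as two halves:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t simdWidth = 16, typeSize = 4, vec_size = 4;
  const uint32_t block_width = typeSize * simdWidth;            // 64 bytes/row
  const uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
  printf("width field = %u (rows of %u bytes), height field = %u\n",
         blocksize & 0xff, (blocksize & 0xff) + 1, (blocksize >> 16) & 0xff);
  return 0;
}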
@@ -6739,17 +7984,85 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
{
using namespace ir;
uint32_t vec_size = insn.getVectorSize();
- const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
- const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+ const Type type = insn.getType();
+ uint32_t simdWidth = sel.curr.execWidth;
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ // A ushort in SIMD8 fills only half a register, but the data length is still 1.
+ uint32_t data_size = simdWidth * vec_size * typeSize / 32;
+ data_size = data_size? data_size : 1;
+ uint32_t block_width = typeSize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
- for(uint32_t i = 0; i < vec_size; i++)
- {
- valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32));
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
- }
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
- sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+ if(simdWidth == 16 && typeSize == 4)
+ tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
+ else
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ }
+ const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+ const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+ const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+ const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+
+ // Make header
+ sel.push();
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.MOV(offsetx, coordx);
+ sel.MOV(offsety, coordy);
+ // Update block width and height
+ sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+ sel.pop();
+
+ if (simdWidth * typeSize < 64) {
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ sel.MOV(tmpVec[i], valuesVec[i]);
+ }
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ // Now write the data
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
+ sel.pop();
+ } else if (simdWidth * typeSize == 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(tmpVec[i], valuesVec[i]);
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+ // Second half
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ sel.curr.execWidth = 8;
+ // Move the regs to fit the vector register rule.
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+ // Now write the data
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+ sel.pop();
+ } else NOT_IMPLEMENTED;
+
return true;
}
DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 14ac05f..a99b8a9 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -82,6 +82,10 @@ namespace gbe
bool isBranch(void) const;
/*! Is it a label instruction (i.e. change the implicit mask) */
bool isLabel(void) const;
+ /*! Is the src's gen register region the same as all dst regs' regions? */
+ bool sameAsDstRegion(uint32_t srcID);
+ /*! Is it a simple native instruction (i.e. it will map to one ISA instruction)? */
+ bool isNative(void) const;
/*! Get the destination register */
GenRegister &dst(uint32_t dstID) { return regs[dstID]; }
/*! Get the source register */
@@ -104,6 +108,7 @@ namespace gbe
uint16_t function:8;
/*! elemSize for byte scatters / gathers, elemNum for untyped msg, operand number for atomic */
uint16_t elem:8;
+ uint16_t splitSend:1;
};
struct {
/*! Number of sources in the tuple */
@@ -123,6 +128,7 @@ namespace gbe
uint16_t bti:8;
uint16_t msglen:5;
uint16_t is3DWrite:1;
+ uint16_t typedWriteSplitSend:1;
};
struct {
uint16_t rdbti:8;
@@ -154,8 +160,12 @@ namespace gbe
uint32_t printfBTI:8;
uint32_t continueFlag:8;
uint16_t printfSize;
+ uint16_t printfSplitSend:1;
};
- uint32_t workgroupOp;
+ struct {
+ uint16_t workgroupOp;
+ uint16_t splitSend:1;
+ } wgop;
} extra;
/*! Gen opcode */
uint8_t opcode;
@@ -209,6 +219,7 @@ namespace gbe
// Allocates (with a linear allocator) and owns SelectionInstruction
friend class Selection;
};
+ void outputSelectionInst(SelectionInstruction &insn);
/*! Instructions like sends require to make registers contiguous in GRF */
class SelectionVector : public NonCopyable, public intrusive_list_node
@@ -253,7 +264,6 @@ namespace gbe
void append(SelectionInstruction *insn);
/*! Append a new selection instruction at the beginning of the block */
void prepend(SelectionInstruction *insn);
- bool isLargeBlock;
ir::LabelIndex endifLabel;
int endifOffset;
bool hasBarrier;
@@ -314,6 +324,8 @@ namespace gbe
void optimize(void);
uint32_t opt_features;
+ /* Add insn ID for sel IR */
+ void addID(void);
const GenContext &getCtx();
/*! Use custom allocators */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index ccaf526..5d96e9e 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -38,6 +38,7 @@ DECL_SELECTION_IR(I64MUL, I64MULInstruction)
DECL_SELECTION_IR(I64DIV, I64DIVREMInstruction)
DECL_SELECTION_IR(I64REM, I64DIVREMInstruction)
DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
+DECL_SELECTION_IR(ATOMICA64, AtomicA64Instruction)
DECL_SELECTION_IR(MACH, BinaryInstruction)
DECL_SELECTION_IR(CMP, CompareInstruction)
DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
@@ -54,10 +55,16 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
DECL_SELECTION_IR(FENCE, FenceInstruction)
DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(UNTYPED_READA64, UntypedReadA64Instruction)
+DECL_SELECTION_IR(UNTYPED_WRITEA64, UntypedWriteA64Instruction)
DECL_SELECTION_IR(READ64, Read64Instruction)
DECL_SELECTION_IR(WRITE64, Write64Instruction)
+DECL_SELECTION_IR(READ64A64, Read64A64Instruction)
+DECL_SELECTION_IR(WRITE64A64, Write64A64Instruction)
DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
+DECL_SELECTION_IR(BYTE_GATHERA64, ByteGatherA64Instruction)
+DECL_SELECTION_IR(BYTE_SCATTERA64, ByteScatterA64Instruction)
DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction)
DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
@@ -100,3 +107,4 @@ DECL_SELECTION_IR(OBREAD, OBReadInstruction)
DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
DECL_SELECTION_IR(MBREAD, MBReadInstruction)
DECL_SELECTION_IR(MBWRITE, MBWriteInstruction)
+DECL_SELECTION_IR(BFREV, UnaryInstruction)
diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp b/backend/src/backend/gen_insn_selection_optimize.cpp
index b8aa776..512a5bd 100644
--- a/backend/src/backend/gen_insn_selection_optimize.cpp
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -18,11 +18,12 @@ namespace gbe
uint32_t elements = 0;
uint32_t elementSize = typeSize(reg.type);
uint32_t width = GenRegister::width_size(reg);
- assert(execWidth >= width);
+ // reg may be another insn's source; this insn's width is not forced to be
+ // larger than execWidth.
+ //assert(execWidth >= width);
uint32_t height = execWidth / width;
uint32_t vstride = GenRegister::vstride_size(reg);
uint32_t hstride = GenRegister::hstride_size(reg);
- uint32_t base = reg.subnr;
+ uint32_t base = reg.nr * GEN_REG_SIZE + reg.subnr;
for (uint32_t i = 0; i < height; ++i) {
uint32_t offsetInByte = base;
for (uint32_t j = 0; j < width; ++j) {
@@ -132,7 +133,7 @@ namespace gbe
for (ReplaceInfoMap::iterator pos = replaceInfoMap.begin(); pos != replaceInfoMap.end(); ++pos) {
ReplaceInfo* info = pos->second;
if (info->intermedia.reg() == var.reg()) { //intermedia is overwritten
- if (info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr) {
+ if (info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr && info->intermedia.nr == var.nr) {
// We need to check if the intermedia is fully overwritten; it may be in some predication state.
if (CanBeReplaced(info, insn, var))
doReplacement(info);
@@ -161,7 +162,7 @@ namespace gbe
assert(insn.opcode == SEL_OP_MOV);
const GenRegister& src = insn.src(0);
const GenRegister& dst = insn.dst(0);
- if (src.type != dst.type || src.file != dst.file)
+ if (src.type != dst.type || src.file != dst.file || src.hstride != dst.hstride)
return;
if (liveout.find(dst.reg()) != liveout.end())
@@ -207,7 +208,8 @@ namespace gbe
if (info->insn.state.inversePredicate != insn.state.inversePredicate)
return false;
- if (info->intermedia.type == var.type && info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr) {
+ if (info->intermedia.type == var.type && info->intermedia.quarter == var.quarter &&
+ info->intermedia.subnr == var.subnr && info->intermedia.nr == var.nr) {
uint32_t elements = CalculateElements(var, insn.state.execWidth); //considering width, hstrid, vstrid and execWidth
if (info->elements == elements)
return true;
@@ -285,4 +287,14 @@ namespace gbe
//do global optimization
}
+
+ void Selection::addID()
+ {
+ uint32_t insnID = 0;
+ for (auto &block : *blockList)
+ for (auto &insn : block.insnList) {
+ insn.ID = insnID;
+ insnID += 2;
+ }
+ }
} /* namespace gbe */
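
Why addID() steps instruction IDs by 2 rather than 1: the register allocator (see the gen_reg_allocation.cpp hunk below) lets a source whose region matches the dst of a "native" instruction end its interval at ID - 1, one slot before the dst begins, so the two can share a register. A toy model:

#include <cstdio>

int main() {
  // After addID(), instructions get IDs 0, 2, 4, ... (odd slots stay free).
  const int dstID = 2;            // a native instruction's position
  const int srcEnd = dstID - 1;   // a matching-region source ends one slot early
  printf("src interval ends at %d, dst starts at %d -> intervals don't overlap\n",
         srcEnd, dstID);
  return 0;
}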
diff --git a/backend/src/backend/gen_insn_selection_output.cpp b/backend/src/backend/gen_insn_selection_output.cpp
index ed26650..f23e8c8 100644
--- a/backend/src/backend/gen_insn_selection_output.cpp
+++ b/backend/src/backend/gen_insn_selection_output.cpp
@@ -45,7 +45,7 @@ namespace gbe
cout << "(abs)";
cout << "%" << reg.value.reg;
if (reg.subphysical)
- cout << "." << reg.subnr;
+ cout << "." << reg.subnr + reg.nr * GEN_REG_SIZE;
if (dst)
cout << "<" << GenRegister::hstride_size(reg) << ">";
@@ -96,77 +96,91 @@ namespace gbe
}
#define OP_NAME_LENGTH 512
- void outputSelectionIR(GenContext &ctx, Selection* sel)
- {
- cout << "SELECTION IR begin:" << endl;
- cout << "WARNING: not completed yet, welcome for the FIX!" << endl;
- for (SelectionBlock &block : *sel->blockList) {
- for (SelectionInstruction &insn : block.insnList) {
- char opname[OP_NAME_LENGTH];
- if (insn.isLabel()) {
- cout << " L" << insn.index << ":" << endl;
- continue;
- } else {
- switch (insn.opcode) {
- #define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: sprintf(opname, "%s", #OP); break;
- #include "backend/gen_insn_selection.hxx"
- #undef DECL_SELECTION_IR
- }
- }
+ void outputSelectionInst(SelectionInstruction &insn) {
+ cout<<"["<<insn.ID<<"]";
+ if (insn.state.predicate != GEN_PREDICATE_NONE) {
+ if (insn.state.physicalFlag == 0)
+ cout << "(f" << insn.state.flagIndex << ")\t";
+ else
+ cout << "(f" << insn.state.flag << "." << insn.state.subFlag << ")\t";
+ }
+ else
+ cout << " \t";
- if (insn.opcode == SEL_OP_CMP) {
- switch (insn.extra.function) {
- case GEN_CONDITIONAL_LE:
- strcat(opname, ".le");
- break;
- case GEN_CONDITIONAL_L:
- strcat(opname, ".l");
- break;
- case GEN_CONDITIONAL_GE:
- strcat(opname, ".ge");
- break;
- case GEN_CONDITIONAL_G:
- strcat(opname, ".g");
- break;
- case GEN_CONDITIONAL_EQ:
- strcat(opname, ".eq");
- break;
- case GEN_CONDITIONAL_NEQ:
- strcat(opname, ".neq");
- break;
- }
- }
+ char opname[OP_NAME_LENGTH];
+ if (insn.isLabel()) {
+ cout << " L" << insn.index << ":" << endl;
+ return;
+ } else {
+ switch (insn.opcode) {
+ #define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: sprintf(opname, "%s", #OP); break;
+ #include "backend/gen_insn_selection.hxx"
+ #undef DECL_SELECTION_IR
+ }
+ }
- int n = strlen(opname);
- if(n >= OP_NAME_LENGTH - 20) {
- cout << "opname too long: " << opname << endl;
- return;
- }
+ if (insn.opcode == SEL_OP_CMP) {
+ switch (insn.extra.function) {
+ case GEN_CONDITIONAL_LE:
+ strcat(opname, ".le");
+ break;
+ case GEN_CONDITIONAL_L:
+ strcat(opname, ".l");
+ break;
+ case GEN_CONDITIONAL_GE:
+ strcat(opname, ".ge");
+ break;
+ case GEN_CONDITIONAL_G:
+ strcat(opname, ".g");
+ break;
+ case GEN_CONDITIONAL_EQ:
+ strcat(opname, ".eq");
+ break;
+ case GEN_CONDITIONAL_NEQ:
+ strcat(opname, ".neq");
+ break;
+ }
+ }
- sprintf(&opname[n], "(%d)", insn.state.execWidth);
- cout << " " << left << setw(20) << opname;
+ int n = strlen(opname);
+ if(n >= OP_NAME_LENGTH - 20) {
+ cout << "opname too long: " << opname << endl;
+ return;
+ }
- for (int i = 0; i < insn.dstNum; ++i)
- {
- GenRegister dst = insn.dst(i);
- outputGenReg(dst, true);
- cout << "\t";
- }
+ sprintf(&opname[n], "(%d)", insn.state.execWidth);
+ cout << left << setw(20) << opname;
- cout << ":\t";
+ for (int i = 0; i < insn.dstNum; ++i)
+ {
+ GenRegister dst = insn.dst(i);
+ outputGenReg(dst, true);
+ cout << "\t";
+ }
- for (int i = 0; i < insn.srcNum; ++i)
- {
- GenRegister src = insn.src(i);
- outputGenReg(src, false);
- cout << "\t";
- }
+ cout << ":\t";
- cout << endl;
+ for (int i = 0; i < insn.srcNum; ++i)
+ {
+ GenRegister src = insn.src(i);
+ outputGenReg(src, false);
+ cout << "\t";
+ }
+
+ cout << endl;
+ }
+
+ void outputSelectionIR(GenContext &ctx, Selection* sel, const char* KernelName)
+ {
+ cout << KernelName << "'s SELECTION IR begin:" << endl;
+ cout << "WARNING: not completed yet, welcome for the FIX!" << endl;
+ for (SelectionBlock &block : *sel->blockList) {
+ for (SelectionInstruction &insn : block.insnList) {
+ outputSelectionInst(insn);
}
cout << endl;
}
- cout << "SELECTION IR end." << endl << endl;
+ cout << KernelName << "'s SELECTION IR end." << endl << endl;
}
}
diff --git a/backend/src/backend/gen_insn_selection_output.hpp b/backend/src/backend/gen_insn_selection_output.hpp
index dd372dc..e1c72af 100644
--- a/backend/src/backend/gen_insn_selection_output.hpp
+++ b/backend/src/backend/gen_insn_selection_output.hpp
@@ -6,7 +6,7 @@ namespace gbe
class Selection; // Pre ISA code
class GenContext; // Handle compilation for Gen
- void outputSelectionIR(GenContext &ctx, Selection* sel);
+ void outputSelectionIR(GenContext &ctx, Selection* sel, const char* KernelName);
} /* namespace gbe */
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 4ef82d1..073ede6 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -212,6 +212,7 @@ namespace gbe {
kernel = ctx->compileKernel();
if (kernel != NULL) {
GBE_ASSERT(ctx->getErrCode() == NO_ERROR);
+ kernel->setOclVersion(unit.getOclVersion());
break;
}
simdFn->getImageSet()->clearInfo();
@@ -351,8 +352,10 @@ namespace gbe {
#endif
// if load 32 bit spir binary, the triple should be spir-unknown-unknown.
llvm::Triple triple(module->getTargetTriple());
- if(triple.getArchName() == "spir" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+ if (triple.getArchName() == "spir" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
module->setTargetTriple("spir");
+ } else if (triple.getArchName() == "spir64" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+ module->setTargetTriple("spir64");
}
releaseLLVMContextLock();
if(module == NULL){
@@ -524,7 +527,7 @@ namespace gbe {
size_t stringSize,
char *err,
size_t *errSize,
- const char * options)
+ const char * options)
{
#ifdef GBE_COMPILER_AVAILABLE
using namespace gbe;
@@ -544,7 +547,9 @@ namespace gbe {
if (strstr(options, "-cl-fast-relaxed-math") != NULL)
fast_relaxed_math = 1;
- char *options_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+ char *options_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+ if (options_str == NULL)
+ return;
memcpy(options_str, options, strlen(options) + 1);
std::string optionStr(options_str);
while (end != std::string::npos) {
@@ -565,11 +570,11 @@ namespace gbe {
GenProgram* p = (GenProgram*) program;
p->fast_relaxed_math = fast_relaxed_math;
if (!dumpASMFileName.empty()) {
- p->asm_file_name = dumpASMFileName.c_str();
- FILE *asmDumpStream = fopen(dumpASMFileName.c_str(), "w");
- if (asmDumpStream)
- fclose(asmDumpStream);
- }
+ p->asm_file_name = dumpASMFileName.c_str();
+ FILE *asmDumpStream = fopen(dumpASMFileName.c_str(), "w");
+ if (asmDumpStream)
+ fclose(asmDumpStream);
+ }
// Try to compile the program
acquireLLVMContextLock();
llvm::Module* module = (llvm::Module*)p->module;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 4451efb..d88b316 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -86,14 +86,18 @@ namespace gbe
INLINE void getRegAttrib(ir::Register reg, uint32_t ®Size, ir::RegisterFamily *regFamily = NULL) const {
// Note that byte vector registers use two bytes per byte (and can be
// interleaved)
- static const size_t familyVectorSize[] = {2,2,2,4,8};
- static const size_t familyScalarSize[] = {2,2,2,4,8};
+ static const size_t familyVectorSize[] = {2,2,2,4,8,16,32};
+ static const size_t familyScalarSize[] = {2,2,2,4,8,16,32};
using namespace ir;
const bool isScalar = ctx.sel->isScalarReg(reg);
const RegisterData regData = ctx.sel->getRegisterData(reg);
const RegisterFamily family = regData.family;
- const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
- regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+ if (family == ir::FAMILY_REG)
+ regSize = 32;
+ else {
+ const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
+ regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+ }
if (regFamily != NULL)
*regFamily = family;
}
@@ -148,7 +152,6 @@ namespace gbe
vector<GenRegInterval> intervals;
/*! All the boolean register intervals on the corresponding BB*/
typedef map<ir::Register, GenRegInterval> RegIntervalMap;
- set<SelectionBlock *> flag0ReservedBlocks;
map<SelectionBlock *, RegIntervalMap *> boolIntervalsMap;
/*! Intervals sorting based on starting point positions */
vector<GenRegInterval*> starting;
@@ -425,6 +428,12 @@ namespace gbe
#define GET_FLAG_REG(insn) GenRegister::uwxgrf(IS_SCALAR_FLAG(insn) ? 1 : 8,\
ir::Register(insn.state.flagIndex));
#define IS_TEMP_FLAG(insn) (insn.state.flag == 0 && insn.state.subFlag == 1)
+ #define NEED_DST_GRF_TYPE_FIX(ty) \
+ (ty == GEN_TYPE_F || \
+ ty == GEN_TYPE_HF || \
+ ty == GEN_TYPE_DF || \
+ ty == GEN_TYPE_UL || \
+ ty == GEN_TYPE_L)
// Flag is a virtual flag, this function is to validate the virtual flag
// to a physical flag. It is used to validate both temporary flag and the
// non-temporary flag registers.
@@ -497,7 +506,7 @@ namespace gbe
map<ir::Register, uint32_t> allocatedFlags;
map<const GenRegInterval*, uint32_t> allocatedFlagIntervals;
- const uint32_t flagNum = flag0ReservedBlocks.contains(&block) ? 2 : 3;
+ const uint32_t flagNum = 3;
uint32_t freeFlags[] = {2, 3, 0};
uint32_t freeNum = flagNum;
if (boolIntervalsMap.find(&block) == boolIntervalsMap.end())
@@ -590,6 +599,14 @@ namespace gbe
// is called a "conditional modifier"). The other instructions just read
// it
if (insn.state.physicalFlag == 0) {
+ // For a SEL.bool instruction, the dst register should be stored in the GRF;
+ // the predicate is supplied by the flag register.
+ if (insn.opcode == SEL_OP_SEL) {
+ ir::Register dst = insn.dst(0).reg();
+ if (ctx.sel->getRegisterFamily(dst) == ir::FAMILY_BOOL &&
+ allocatedFlags.find(dst) != allocatedFlags.end())
+ allocatedFlags.erase(dst);
+ }
auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
if (it != allocatedFlags.end()) {
insn.state.physicalFlag = 1;
@@ -629,19 +646,28 @@ namespace gbe
if (insn.state.predicate != GEN_PREDICATE_NONE)
validateFlag(selection, insn);
}
- // This is a CMP for a pure flag booleans, we don't need to write result to
- // the grf. And latter, we will not allocate grf for it.
if (insn.opcode == SEL_OP_CMP &&
(flagBooleans.contains(insn.dst(0).reg()) ||
GenRegister::isNull(insn.dst(0)))) {
+ // This is a CMP for pure flag booleans, so we don't need to write the result
+ // to the GRF, and later we will not allocate a GRF for it.
// set a temporary register to avoid switch in this block.
bool isSrc = false;
bool needMov = false;
ir::Type ir_type = ir::TYPE_FLOAT;
- if (insn.src(0).isint64())
- ir_type = ir::TYPE_U64;
+
+ // The (src : dst) type mappings below for 'cmp'
+ // are allowed by the hardware:
+ // B,W,D,F : F
+ // HF : HF
+ // DF : DF
+ // Q : Q
+ if (NEED_DST_GRF_TYPE_FIX(insn.src(0).type))
+ ir_type = getIRType(insn.src(0).type);
+
this->replaceReg(selection, &insn, 0, isSrc, ir_type, needMov);
}
+
// If the instruction requires to generate (CMP for long/int/float..)
// the flag value to the register, and it's not a pure flag boolean,
// we need to use SEL instruction to generate the flag value to the UW8
@@ -1215,11 +1241,9 @@ namespace gbe
// Update the intervals of each used register. Note that we do not
// register allocate R0, so we skip all sub-registers in r0
RegIntervalMap *boolsMap = new RegIntervalMap;
- if (block.isLargeBlock)
- flag0ReservedBlocks.insert(&block);
for (auto &insn : block.insnList) {
const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
- insn.ID = insnID;
+ assert(insnID == (int32_t)insn.ID);
bool is3SrcOp = insn.opcode == SEL_OP_MAD;
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const GenRegister &selReg = insn.src(srcID);
@@ -1244,8 +1268,14 @@ namespace gbe
if (this->intervals[reg].conflictReg == 0 ||
this->intervals[reg].conflictReg > conflictReg)
this->intervals[reg].conflictReg = conflictReg;
- this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
- this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+ int insnsrcID = insnID;
+ // If the instruction is simple, src and dst can be reused, so give them
+ // different IDs. An insn may be split in the encoder; if the register regions
+ // are not the same it can't be reused. Since it is hard to tell here whether
+ // it will be split, only check the register region.
+ if (insn.isNative() && insn.sameAsDstRegion(srcID))
+ insnsrcID -= 1;
+ this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnsrcID);
+ this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnsrcID);
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const GenRegister &selReg = insn.dst(dstID);
@@ -1289,8 +1319,6 @@ namespace gbe
// is out-of the if/endif region, so we have to borrow the f0
// to get correct bits for all channels.
boolsMap->find(reg)->second.minID = 0;
- if (flag0ReservedBlocks.contains(&block))
- flag0ReservedBlocks.erase(&block);
}
} else {
// Make sure that the instruction selection stage didn't use physical flags incorrectly.
@@ -1299,11 +1327,10 @@ namespace gbe
insn.opcode == SEL_OP_JMPI ||
insn.state.predicate == GEN_PREDICATE_NONE ||
(block.hasBarrier && insn.opcode == SEL_OP_MOV) ||
- (insn.state.flag == 0 && insn.state.subFlag == 1) ||
- (block.removeSimpleIfEndif && insn.state.flag == 0 && insn.state.subFlag == 0) ));
+ (insn.state.flag == 0 && insn.state.subFlag == 1) ));
}
lastID = insnID;
- insnID++;
+ insnID += 2;
}
// All registers alive at the begining of the block must update their intervals.
@@ -1472,7 +1499,7 @@ do { \
}
GBE_ASSERT(RA.contains(reg.reg()) != false);
const uint32_t grfOffset = RA.find(reg.reg())->second;
- const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+ const uint32_t suboffset = reg.subphysical ? reg.nr * GEN_REG_SIZE + reg.subnr : 0;
const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
if (reg.quarter != 0)
return GenRegister::Qn(dst, reg.quarter);
@@ -1522,7 +1549,8 @@ do { \
const ir::FunctionArgument &arg = this->opaque->ctx.getFunction().getArg(subType);
if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
arg.type == ir::FunctionArgument::LOCAL_POINTER ||
- arg.type == ir::FunctionArgument::CONSTANT_POINTER)
+ arg.type == ir::FunctionArgument::CONSTANT_POINTER||
+ arg.type == ir::FunctionArgument::PIPE)
regSize = this->opaque->ctx.getPointerSize();
else
regSize = arg.size;
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index a8eb2e4..6c73f5e 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -169,6 +169,12 @@ namespace gbe
NOT_IMPLEMENTED;
}
}
+ void useVirtualFlag(ir::Register flag, unsigned pred) {
+ modFlag = 0;
+ physicalFlag = 0;
+ flagIndex = flag;
+ predicate = pred;
+ }
void useFlag(int nr, int subnr) {
flag = nr;
subFlag = subnr;
@@ -268,6 +274,10 @@ namespace gbe
static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
GenRegister r = reg;
+ if(subnr >= 32){
+ nr += subnr / 32;
+ subnr = subnr % 32;
+ }
r.nr += nr;
r.subnr += subnr;
r.subphysical = 1;
@@ -283,6 +293,14 @@ namespace gbe
return r;
}
+ INLINE bool isSameRegion(GenRegister reg) const {
+ return reg.file == file &&
+ typeSize(reg.type) == typeSize(type) &&
+ reg.vstride == vstride &&
+ reg.width == width &&
+ reg.hstride == hstride;
+ }
+
static INLINE uint32_t grfOffset(GenRegister reg) {
return reg.nr * GEN_REG_SIZE + reg.subnr;
}
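
A minimal standalone model of the subnr carry added to GenRegister::offset above, assuming the 32-byte GRF size the file already uses (GEN_REG_SIZE):

#include <cstdio>

int main() {
  int nr = 2, subnr = 40;          // 40 bytes past the start of r2
  if (subnr >= 32) {               // roll whole 32-byte GRFs into nr
    nr += subnr / 32;
    subnr %= 32;
  }
  printf("r%d.%d\n", nr, subnr);   // -> r3.8
  return 0;
}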
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 2224880..85d0aa9 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -89,7 +89,8 @@ namespace gbe {
Kernel::Kernel(const std::string &name) :
name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false),
slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL),
- profilingInfo(NULL) {}
+ profilingInfo(NULL), useDeviceEnqueue(false) {}
+
Kernel::~Kernel(void) {
if(ctx) GBE_DELETE(ctx);
if(samplerSet) GBE_DELETE(samplerSet);
@@ -106,11 +107,14 @@ namespace gbe {
return it->offset; // we found it!
}
- Program::Program(uint32_t fast_relaxed_math) : fast_relaxed_math(fast_relaxed_math), constantSet(NULL) {}
+ Program::Program(uint32_t fast_relaxed_math) : fast_relaxed_math(fast_relaxed_math),
+ constantSet(NULL),
+ relocTable(NULL) {}
Program::~Program(void) {
for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it)
GBE_DELETE(it->second);
if (constantSet) delete constantSet;
+ if (relocTable) delete relocTable;
}
#ifdef GBE_COMPILER_AVAILABLE
@@ -119,7 +123,10 @@ namespace gbe {
IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
BVAR(OCL_OUTPUT_BUILD_LOG, false);
- bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
+ bool Program::buildFromLLVMFile(const char *fileName,
+ const void* module,
+ std::string &error,
+ int optLevel) {
ir::Unit *unit = new ir::Unit();
llvm::Module * cloned_module = NULL;
bool ret = false;
@@ -174,6 +181,8 @@ namespace gbe {
bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
constantSet = new ir::ConstantSet(unit.getConstantSet());
+ relocTable = new ir::RelocTable(unit.getRelocTable());
+ blockFuncs = unit.blockFuncs;
const auto &set = unit.getFunctionSet();
const uint32_t kernelNum = set.size();
if (OCL_OUTPUT_GEN_IR) std::cout << unit;
@@ -212,6 +221,7 @@ namespace gbe {
uint32_t ret_size = 0;
uint32_t ker_num = kernels.size();
uint32_t has_constset = 0;
+ uint32_t has_relocTable = 0;
OUT_UPDATE_SZ(magic_begin);
@@ -227,6 +237,18 @@ namespace gbe {
OUT_UPDATE_SZ(has_constset);
}
+ if(relocTable) {
+ has_relocTable = 1;
+ OUT_UPDATE_SZ(has_relocTable);
+ uint32_t sz = relocTable->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_relocTable);
+ }
+
OUT_UPDATE_SZ(ker_num);
for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it) {
uint32_t sz = it->second->serializeToBin(outs);
@@ -247,6 +269,7 @@ namespace gbe {
int has_constset = 0;
uint32_t ker_num;
uint32_t magic;
+ uint32_t has_relocTable = 0;
IN_UPDATE_SZ(magic);
if (magic != magic_begin)
@@ -263,6 +286,17 @@ namespace gbe {
total_size += sz;
}
+ IN_UPDATE_SZ(has_relocTable);
+ if(has_relocTable) {
+ relocTable = new ir::RelocTable;
+ uint32_t sz = relocTable->deserializeFromBin(ins);
+
+ if (sz == 0)
+ return 0;
+
+ total_size += sz;
+ }
+
IN_UPDATE_SZ(ker_num);
for (uint32_t i = 0; i < ker_num; i++) {
@@ -303,6 +337,8 @@ namespace gbe {
outs.write(name.c_str(), name.size());
ret_size += sizeof(char)*name.size();
+ OUT_UPDATE_SZ(oclVersion);
+
OUT_UPDATE_SZ(argNum);
for (i = 0; i < argNum; i++) {
KernelArgument& arg = args[i];
@@ -415,6 +451,8 @@ namespace gbe {
name = c_name;
delete[] c_name;
+ IN_UPDATE_SZ(oclVersion);
+
IN_UPDATE_SZ(argNum);
args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
for (uint32_t i = 0; i < argNum; i++) {
@@ -616,7 +654,7 @@ namespace gbe {
#ifdef GBE_COMPILER_AVAILABLE
static bool buildModuleFromSource(const char *source, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
std::string dumpLLVMFileName, std::string dumpSPIRBinaryName, std::vector<std::string>& options, size_t stringSize, char *err,
- size_t *errSize) {
+ size_t *errSize, uint32_t oclVersion) {
// Arguments to pass to the clang frontend
vector<const char *> args;
bool bFastMath = false;
@@ -626,6 +664,17 @@ namespace gbe {
}
args.push_back("-cl-kernel-arg-info");
+ // ParseCommandLineOptions, used for the -mllvm args, cannot be used with
+ // multiple threads, and GVN now has a 100-instruction limit on block scans.
+ // Pass a bigger limit only once per context; this also fixes the multithread bug.
+#if LLVM_VERSION_MINOR >= 8
+ static bool ifsetllvm = false;
+ if(!ifsetllvm) {
+ args.push_back("-mllvm");
+ args.push_back("-memdep-block-scan-limit=200");
+ ifsetllvm = true;
+ }
+#endif
#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
#endif
@@ -643,7 +692,11 @@ namespace gbe {
args.push_back("-x");
args.push_back("cl");
args.push_back("-triple");
- args.push_back("spir");
+ if (oclVersion >= 200) {
+ args.push_back("spir64");
+ args.push_back("-fblocks");
+ } else
+ args.push_back("spir");
#endif /* LLVM_VERSION_MINOR <= 2 */
args.push_back("stringInput.cl");
args.push_back("-ffp-contract=on");
@@ -691,7 +744,18 @@ namespace gbe {
clang::LangOptions & lang_opts = Clang.getLangOpts();
lang_opts.OpenCL = 1;
- GBE_ASSERT(Clang.getFrontendOpts().LLVMArgs.empty() && "We do not have llvm args now");
+ // LLVM flags need command-line parsing to take effect.
+ if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
+ unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
+ const char **Args = new const char*[NumArgs + 2];
+ Args[0] = "clang (LLVM option parsing)";
+ for (unsigned i = 0; i != NumArgs; ++i){
+ Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
+ }
+ Args[NumArgs + 1] = 0;
+ llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
+ delete [] Args;
+ }
// Create an action and make the compiler instance carry it out
std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
@@ -774,6 +838,7 @@ namespace gbe {
SVAR(OCL_PCH_PATH, OCL_PCH_OBJECT);
+ SVAR(OCL_PCH_20_PATH, OCL_PCH_OBJECT_20);
SVAR(OCL_HEADER_FILE_DIR, OCL_HEADER_DIR);
BVAR(OCL_OUTPUT_KERNEL_SOURCE, false);
@@ -787,10 +852,9 @@ namespace gbe {
int& optLevel,
size_t stringSize,
char *err,
- size_t *errSize)
+ size_t *errSize,
+ uint32_t &oclVersion)
{
- std::string dirs = OCL_PCH_PATH;
- std::istringstream idirs(dirs);
std::string pchFileName;
bool findPCH = false;
#if defined(__ANDROID__)
@@ -803,8 +867,6 @@ namespace gbe {
std::string hdirs = OCL_HEADER_FILE_DIR;
if(hdirs == "")
hdirs = OCL_HEADER_DIR;
- if(dirs == "")
- dirs = OCL_PCH_OBJECT;
std::istringstream hidirs(hdirs);
std::string headerFilePath;
bool findOcl = false;
@@ -816,6 +878,7 @@ namespace gbe {
break;
}
}
+ (void) findOcl;
assert(findOcl);
if (OCL_OUTPUT_KERNEL_SOURCE) {
if(options) {
@@ -831,6 +894,8 @@ namespace gbe {
if (options) {
char *c_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+ if (c_str == NULL)
+ return false;
memcpy(c_str, options, strlen(options) + 1);
std::string optionStr(c_str);
const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
@@ -912,11 +977,16 @@ EXTEND_QUOTE:
if(str.find("-cl-std=") != std::string::npos) {
useDefaultCLCVersion = false;
- if (str == "-cl-std=CL1.1")
+ if (str == "-cl-std=CL1.1") {
clOpt.push_back("-D__OPENCL_C_VERSION__=110");
- else if (str == "-cl-std=CL1.2")
+ oclVersion = 110;
+ } else if (str == "-cl-std=CL1.2") {
clOpt.push_back("-D__OPENCL_C_VERSION__=120");
- else {
+ oclVersion = 120;
+ } else if (str == "-cl-std=CL2.0") {
+ clOpt.push_back("-D__OPENCL_C_VERSION__=200");
+ oclVersion = 200;
+ } else {
if (err && stringSize > 0 && errSize)
*errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
return false;
@@ -952,8 +1022,15 @@ EXTEND_QUOTE:
}
if (useDefaultCLCVersion) {
+#ifdef ENABLE_OPENCL_20
+ clOpt.push_back("-D__OPENCL_C_VERSION__=200");
+ clOpt.push_back("-cl-std=CL2.0");
+ oclVersion = 200;
+#else
clOpt.push_back("-D__OPENCL_C_VERSION__=120");
clOpt.push_back("-cl-std=CL1.2");
+ oclVersion = 120;
+#endif
}
//for clCompilerProgram usage.
if(temp_header_path){
@@ -961,6 +1038,14 @@ EXTEND_QUOTE:
clOpt.push_back(temp_header_path);
}
+ std::string dirs = OCL_PCH_PATH;
+ if(oclVersion >= 200)
+ dirs = OCL_PCH_20_PATH;
+ if(dirs == "") {
+ dirs = oclVersion >= 200 ? OCL_PCH_OBJECT_20 : OCL_PCH_OBJECT;
+ }
+ std::istringstream idirs(dirs);
+
while (getline(idirs, pchFileName, ':')) {
if(access(pchFileName.c_str(), R_OK) == 0) {
findPCH = true;
@@ -991,10 +1076,11 @@ EXTEND_QUOTE:
std::vector<std::string> clOpt;
std::string dumpLLVMFileName, dumpASMFileName;
std::string dumpSPIRBinaryName;
+ uint32_t oclVersion = 0;
if (!processSourceAndOption(source, options, NULL, clOpt,
dumpLLVMFileName, dumpASMFileName, dumpSPIRBinaryName,
optLevel,
- stringSize, err, errSize))
+ stringSize, err, errSize, oclVersion))
return NULL;
gbe_program p;
@@ -1006,7 +1092,7 @@ EXTEND_QUOTE:
llvm_mutex.lock();
if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
- stringSize, err, errSize)) {
+ stringSize, err, errSize, oclVersion)) {
// Now build the program from llvm
size_t clangErrSize = 0;
if (err != NULL && *errSize != 0) {
@@ -1053,9 +1139,10 @@ EXTEND_QUOTE:
std::vector<std::string> clOpt;
std::string dumpLLVMFileName, dumpASMFileName;
std::string dumpSPIRBinaryName;
+ uint32_t oclVersion = 0;
if (!processSourceAndOption(source, options, temp_header_path, clOpt,
dumpLLVMFileName, dumpASMFileName, dumpSPIRBinaryName,
- optLevel, stringSize, err, errSize))
+ optLevel, stringSize, err, errSize, oclVersion))
return NULL;
gbe_program p;
@@ -1070,7 +1157,7 @@ EXTEND_QUOTE:
#endif
if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
- stringSize, err, errSize)) {
+ stringSize, err, errSize, oclVersion)) {
// Now build the program from llvm
if (err != NULL) {
GBE_ASSERT(errSize != NULL);
@@ -1164,12 +1251,30 @@ EXTEND_QUOTE:
program->getGlobalConstantData(mem);
}
+ static size_t programGetGlobalRelocCount(gbe_program gbeProgram) {
+ if (gbeProgram == NULL) return 0;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ return program->getGlobalRelocCount();
+ }
+
+ static void programGetGlobalRelocTable(gbe_program gbeProgram, char *mem) {
+ if (gbeProgram == NULL) return;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ program->getGlobalRelocTable(mem);
+ }
+
static uint32_t programGetKernelNum(gbe_program gbeProgram) {
if (gbeProgram == NULL) return 0;
const gbe::Program *program = (const gbe::Program*) gbeProgram;
return program->getKernelNum();
}
static const char* programGetDeviceEnqueueKernelName(gbe_program gbeProgram, uint32_t index) {
if (gbeProgram == NULL) return NULL;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ return program->getDeviceEnqueueKernelName(index);
+ }
+
static gbe_kernel programGetKernelByName(gbe_program gbeProgram, const char *name) {
if (gbeProgram == NULL) return NULL;
const gbe::Program *program = (gbe::Program*) gbeProgram;
@@ -1228,6 +1333,8 @@ EXTEND_QUOTE:
return (void *)(info->typeQual.c_str());
case GBE_GET_ARG_INFO_NAME:
return (void *)(info->argName.c_str());
+ case GBE_GET_ARG_INFO_TYPESIZE:
+ return (void *)((size_t)info->typeSize);
default:
assert(0);
}
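
GBE_GET_ARG_INFO_TYPESIZE follows the existing convention of returning everything through the void* result: strings come back as C-string pointers, the new type size as an integer cast into the pointer. A hedged caller-side sketch, assuming backend/src/backend/program.h is included and the callback takes (kernel, index, value) as seen here:

    // Hypothetical helper; the cast simply undoes the (void *)(size_t)
    // cast performed by kernelGetArgInfo above.
    static size_t argTypeSize(gbe_kernel k, uint32_t argID) {
      return (size_t)gbe_kernel_get_arg_info(k, argID, GBE_GET_ARG_INFO_TYPESIZE);
    }
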
@@ -1333,6 +1440,12 @@ EXTEND_QUOTE:
return ps->getPrintfNum();
}
+ static uint32_t kernelUseDeviceEnqueue(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getUseDeviceEnqueue();
+ }
+
static void* kernelDupPrintfSet(gbe_kernel gbeKernel) {
if (gbeKernel == NULL) return NULL;
const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -1376,6 +1489,12 @@ EXTEND_QUOTE:
kernel->getImageData(images);
}
+ static uint32_t kernelGetOclVersion(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getOclVersion();
+ }
+
static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
return 0u;
}
@@ -1405,11 +1524,14 @@ GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU
GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table = NULL;
GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_device_enqueue_kernel_name_cb *gbe_program_get_device_enqueue_kernel_name = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
@@ -1433,6 +1555,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data =
GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_ocl_version_cb *gbe_kernel_get_ocl_version = NULL;
GBE_EXPORT_SYMBOL gbe_output_profiling_cb *gbe_output_profiling = NULL;
GBE_EXPORT_SYMBOL gbe_dup_profiling_cb *gbe_dup_profiling = NULL;
GBE_EXPORT_SYMBOL gbe_get_profiling_bti_cb *gbe_get_profiling_bti = NULL;
@@ -1441,6 +1564,7 @@ GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_use_device_enqueue_cb *gbe_kernel_use_device_enqueue = NULL;
#ifdef GBE_COMPILER_AVAILABLE
namespace gbe
@@ -1455,9 +1579,12 @@ namespace gbe
gbe_program_check_opt = gbe::programCheckOption;
gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+ gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+ gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
gbe_program_delete = gbe::programDelete;
gbe_program_get_kernel_num = gbe::programGetKernelNum;
+ gbe_program_get_device_enqueue_kernel_name = gbe::programGetDeviceEnqueueKernelName;
gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
gbe_program_get_kernel = gbe::programGetKernel;
gbe_kernel_get_name = gbe::kernelGetName;
@@ -1483,6 +1610,7 @@ namespace gbe
gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
gbe_kernel_get_image_size = gbe::kernelGetImageSize;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
+ gbe_kernel_get_ocl_version = gbe::kernelGetOclVersion;
gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
gbe_get_printf_num = gbe::kernelGetPrintfNum;
gbe_dup_profiling = gbe::kernelDupProfiling;
@@ -1491,6 +1619,7 @@ namespace gbe
gbe_dup_printfset = gbe::kernelDupPrintfSet;
gbe_release_printf_info = gbe::kernelReleasePrintfSet;
gbe_output_printf = gbe::kernelOutputPrintf;
+ gbe_kernel_use_device_enqueue = gbe::kernelUseDeviceEnqueue;
genSetupCallBacks();
}
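
All of the gbe_* entry points are plain function pointers that callBackInitializer fills in, so the runtime picks up the new queries without new link-time symbols. A sketch of driver-side use of the two relocation callbacks (hypothetical helper, assuming program.h; each RelocEntry is the {refOffset, defOffset} pair of unsigned ints defined in ir/reloc.hpp below):

    #include <vector>

    static void fetchRelocs(gbe_program prog, std::vector<char> &buf) {
      const size_t n = gbe_program_get_global_reloc_count(prog);
      buf.resize(n * 2 * sizeof(unsigned int)); // n {refOffset, defOffset} pairs
      if (n)
        gbe_program_get_global_reloc_table(prog, buf.data());
    }
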
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index db770a6..e601c97 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -55,6 +55,7 @@ enum gbe_arg_type {
GBE_ARG_LOCAL_PTR = 3, // __local
GBE_ARG_IMAGE = 4, // image2d_t, image3d_t
GBE_ARG_SAMPLER = 5, // sampler_t
+ GBE_ARG_PIPE = 6, // pipe
GBE_ARG_INVALID = 0xffffffff
};
@@ -65,6 +66,7 @@ enum gbe_get_arg_info_value {
GBE_GET_ARG_INFO_TYPE = 2,
GBE_GET_ARG_INFO_TYPEQUAL = 3,
GBE_GET_ARG_INFO_NAME = 4,
+ GBE_GET_ARG_INFO_TYPESIZE = 5,
GBE_GET_ARG_INFO_INVALID = 0xffffffff
};
@@ -86,6 +88,9 @@ enum gbe_curbe_type {
GBE_CURBE_LOCAL_SIZE_X,
GBE_CURBE_LOCAL_SIZE_Y,
GBE_CURBE_LOCAL_SIZE_Z,
+ GBE_CURBE_ENQUEUED_LOCAL_SIZE_X,
+ GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y,
+ GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z,
GBE_CURBE_GLOBAL_SIZE_X,
GBE_CURBE_GLOBAL_SIZE_Y,
GBE_CURBE_GLOBAL_SIZE_Z,
@@ -109,6 +114,9 @@ enum gbe_curbe_type {
GBE_CURBE_PROFILING_TIMESTAMP3,
GBE_CURBE_PROFILING_TIMESTAMP4,
GBE_CURBE_THREAD_ID,
+ GBE_CURBE_CONSTANT_ADDRSPACE,
+ GBE_CURBE_STACK_SIZE,
+ GBE_CURBE_ENQUEUE_BUF_POINTER,
GBE_GEN_REG,
};
@@ -257,6 +265,11 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_
typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
+typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program gbeProgram);
+extern gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count;
+
+typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program gbeProgram, char *mem);
+extern gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table;
/*! Get the size of defined samplers */
typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
@@ -289,6 +302,9 @@ extern gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name;
typedef gbe_kernel (gbe_program_get_kernel_cb)(gbe_program, uint32_t ID);
extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
+typedef const char* (gbe_program_get_device_enqueue_kernel_name_cb)(gbe_program, uint32_t ID);
+extern gbe_program_get_device_enqueue_kernel_name_cb *gbe_program_get_device_enqueue_kernel_name;
+
/*! Get the kernel name */
typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
@@ -361,6 +377,12 @@ extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
/*! Get slm size needed for kernel local variables */
typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
+/*! Get the kernel's OpenCL version. */
+typedef uint32_t (gbe_kernel_get_ocl_version_cb)(gbe_kernel);
+extern gbe_kernel_get_ocl_version_cb *gbe_kernel_get_ocl_version;
+/*! Whether the kernel uses device enqueue. */
+typedef uint32_t (gbe_kernel_use_device_enqueue_cb)(gbe_kernel);
+extern gbe_kernel_use_device_enqueue_cb *gbe_kernel_use_device_enqueue;
/*mutex to lock global llvmcontext access.*/
extern void acquireLLVMContextLock();
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 1f0ec55..1aff8b9 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -57,6 +57,7 @@ namespace gbe {
std::string accessQual;
std::string typeQual;
std::string argName;
+ uint32_t typeSize;
};
ArgInfo info;
};
@@ -125,6 +126,9 @@ namespace gbe {
INLINE bool getUseSLM(void) const { return this->useSLM; }
/*! get slm size for kernel local variable */
INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+ /*! Set / return the OpenCL version */
+ INLINE void setOclVersion(uint32_t version) { this->oclVersion = version; }
+ INLINE uint32_t getOclVersion(void) const { return this->oclVersion; }
/*! Set sampler set. */
void setSamplerSet(ir::SamplerSet *from) {
samplerSet = from;
@@ -228,6 +232,12 @@ namespace gbe {
virtual uint32_t serializeToBin(std::ostream& outs);
virtual uint32_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
+ /*! Does the kernel use device enqueue? */
+ INLINE bool getUseDeviceEnqueue(void) const { return this->useDeviceEnqueue; }
+ /*! Change the device enqueue info of the function */
+ INLINE bool setUseDeviceEnqueue(bool useDeviceEnqueue) {
+ return this->useDeviceEnqueue = useDeviceEnqueue;
+ }
protected:
friend class Context; //!< Owns the kernels
@@ -240,6 +250,7 @@ namespace gbe {
uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
uint32_t stackSize; //!< Stack size (0 if unused)
uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused)
+ uint32_t oclVersion; //!< OpenCL version (120 for 1.2, 200 for 2.0)
bool useSLM; //!< SLM requires a special HW config
uint32_t slmSize; //!< slm size for kernel variable
Context *ctx; //!< Save context after compiler to alloc constant buffer curbe
@@ -249,6 +260,7 @@ namespace gbe {
ir::ProfilingInfo *profilingInfo; //!< Copy from the corresponding function.
uint32_t compileWgSize[3]; //!< required work group size by kernel attribute.
std::string functionAttributes; //!< function attribute qualifiers combined.
+ bool useDeviceEnqueue; //!< Whether the kernel uses device-side enqueue.
GBE_CLASS(Kernel); //!< Use custom allocators
};
@@ -285,6 +297,12 @@ namespace gbe {
}
return kernel;
}
+
+ const char *getDeviceEnqueueKernelName(uint32_t index) const {
+ if(index >= blockFuncs.size())
+ return NULL;
+ return blockFuncs[index].c_str();
+ }
/*! Build a program from a ir::Unit */
bool buildFromUnit(const ir::Unit &unit, std::string &error);
/*! Builds a program from LLVM source code */
@@ -296,6 +314,8 @@ namespace gbe {
/*! Get the content of global constant arrays */
void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+ uint32_t getGlobalRelocCount(void) const { return relocTable->getCount(); }
+ void getGlobalRelocTable(char *p) const { relocTable->getData(p); }
static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
@@ -327,6 +347,10 @@ namespace gbe {
map<std::string, Kernel*> kernels;
/*! Global (constants) outside any kernel */
ir::ConstantSet *constantSet;
+ /*! relocation table */
+ ir::RelocTable *relocTable;
+ /*! device enqueue functions */
+ vector<std::string> blockFuncs;
/*! Use custom allocators */
GBE_CLASS(Program);
};
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index 34d04dd..64cacd9 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -23,6 +23,7 @@
#include "ir/constant.cpp"
#include "ir/printf.cpp"
#include "ir/profiling.cpp"
+#include "ir/reloc.cpp"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-variable"
@@ -39,6 +40,7 @@ struct BinInterpCallBackInitializer
gbe_program_get_kernel_num = gbe::programGetKernelNum;
gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
gbe_program_get_kernel = gbe::programGetKernel;
+ gbe_program_get_device_enqueue_kernel_name = gbe::programGetDeviceEnqueueKernelName;
gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
gbe_kernel_get_code = gbe::kernelGetCode;
gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
@@ -62,8 +64,11 @@ struct BinInterpCallBackInitializer
gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
gbe_program_delete = gbe::programDelete;
gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+ gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+ gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
+ gbe_kernel_get_ocl_version = gbe::kernelGetOclVersion;
gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
gbe_dup_profiling = gbe::kernelDupProfiling;
@@ -73,6 +78,7 @@ struct BinInterpCallBackInitializer
gbe_dup_printfset = gbe::kernelDupPrintfSet;
gbe_release_printf_info = gbe::kernelReleasePrintfSet;
gbe_output_printf = gbe::kernelOutputPrintf;
+ gbe_kernel_use_device_enqueue = gbe::kernelUseDeviceEnqueue;
}
~BinInterpCallBackInitializer() {
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index 54ae3f1..f16f5b7 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -27,8 +27,7 @@
namespace gbe {
namespace ir {
- void ConstantSet::append(const char *data,
- const std::string &name,
+ void ConstantSet::append(const std::string &name,
uint32_t size,
uint32_t alignment)
{
@@ -36,8 +35,7 @@ namespace ir {
const uint32_t padding = offset - this->data.size();
const Constant constant(name, size, alignment, offset);
constants.push_back(constant);
- for (uint32_t i = 0; i < padding; ++i) this->data.push_back(0);
- for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
+ this->data.resize(padding + size + this->data.size());
}
#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index c9080b8..0835fad 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -69,7 +69,7 @@ namespace ir {
{
public:
/*! Append a new constant in the constant set */
- void append(const char*, const std::string&, uint32_t size, uint32_t alignment);
+ void append(const std::string&, uint32_t size, uint32_t alignment);
/*! Number of constants */
size_t getConstantNum(void) const { return constants.size(); }
/*! Get a special constant */
@@ -91,6 +91,11 @@ namespace ir {
for (size_t i = 0; i < data.size(); i ++)
mem[i] = data[i];
}
+ void setData(char *mem, int offset, int size) {
+ for (int i = 0; i < size; i++) {
+ data[i+offset] = mem[i];
+ }
+ }
ConstantSet() {}
ConstantSet(const ConstantSet& other) : Serializable(other),
data(other.data), constants(other.constants) {}
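
append() no longer takes the constant's bytes: it only reserves zero-filled space, and the new setData() patches bytes in afterwards at a given offset, which is what pointer relocations need. A standalone illustration of the reserve-then-patch pattern (not the real ConstantSet):

    #include <cstring>
    #include <vector>

    struct Blob {
      std::vector<char> data;
      size_t append(size_t size) {               // reserve zero-filled space
        const size_t offset = data.size();
        data.resize(offset + size, 0);
        return offset;
      }
      void setData(const char *mem, size_t offset, size_t size) {
        std::memcpy(&data[offset], mem, size);   // fill it in later
      }
    };
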
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 2fe080a..4c19a42 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -44,7 +44,7 @@ namespace ir {
Function::Function(const std::string &name, const Unit &unit, Profile profile) :
name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0),
- wgBroadcastSLM(-1), tidMapSLM(-1)
+ wgBroadcastSLM(-1), tidMapSLM(-1), useDeviceEnqueue(false)
{
initProfile(*this);
samplerSet = GBE_NEW(SamplerSet);
@@ -62,6 +62,10 @@ namespace ir {
return unit.getPointerFamily();
}
+ uint32_t Function::getOclVersion(void) const {
+ return unit.getOclVersion();
+ }
+
void Function::addLoop(LabelIndex preheader,
int parent,
const vector<LabelIndex> &bbs,
@@ -353,6 +357,7 @@ namespace ir {
out << "structure." << input.size;
break;
case FunctionArgument::IMAGE: out << "image"; break;
+ case FunctionArgument::PIPE: out << "pipe"; break;
default: break;
}
out << " %" << input.reg << " " << input.name << std::endl;
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 71a6d07..5fcb14a 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -170,8 +170,9 @@ namespace ir {
LOCAL_POINTER = 2, // __local
VALUE = 3, // int, float
STRUCTURE = 4, // struct foo
- IMAGE = 5, // image*d_t
- SAMPLER = 6
+ IMAGE = 5, // image*d_t
+ SAMPLER = 6,
+ PIPE = 7 // pipe
};
struct InfoFromLLVM { // All the info about passed by llvm, using -cl-kernel-arg-info
@@ -181,6 +182,7 @@ namespace ir {
std::string accessQual;
std::string typeQual;
std::string argName; // May differ from arg->getName()
+ uint32_t typeSize;
// only llvm-3.6 or later has kernel_arg_base_type in metadata.
@@ -235,6 +237,9 @@ namespace ir {
isImage2dT() || isImage2dArrayT() || isImage3dT();
}
+ bool isPipeType() const {
+ return typeQual.compare("pipe") == 0;
+ }
};
/*! Create a function input argument */
@@ -551,6 +556,13 @@ namespace ir {
}
/*! Output the control flow graph to .dot file */
void outputCFG();
+ uint32_t getOclVersion(void) const;
+ /*! Does the function use device enqueue? */
+ INLINE bool getUseDeviceEnqueue(void) const { return this->useDeviceEnqueue; }
+ /*! Change the device enqueue info of the function */
+ INLINE bool setUseDeviceEnqueue(bool useDeviceEnqueue) {
+ return this->useDeviceEnqueue = useDeviceEnqueue;
+ }
private:
friend class Context; //!< Can freely modify a function
std::string name; //!< Function name
@@ -578,6 +590,7 @@ namespace ir {
std::string functionAttributes; //!< function attribute qualifiers combined.
int32_t wgBroadcastSLM; //!< Used for broadcast the workgroup value.
int32_t tidMapSLM; //!< Used to store the map between groupid and hw thread.
+ bool useDeviceEnqueue; //!< Whether the function uses device-side enqueue.
GBE_CLASS(Function); //!< Use custom allocator
};
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index ed64580..f0c3957 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1070,18 +1070,20 @@ namespace ir {
public TupleDstPolicy<MediaBlockReadInstruction>
{
public:
- INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) {
+ INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type) {
this->opcode = OP_MBREAD;
this->dst = dst;
this->dstNum = vec_size;
this->src = srcTuple;
this->srcNum = srcNum;
this->imageIdx = imageIdx;
+ this->type = type;
}
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
- out << (int)this->getVectorSize();
+ out << "." << type << "."
+ << (int)this->getVectorSize();
out << " {";
for (uint32_t i = 0; i < dstNum; ++i)
out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
@@ -1092,12 +1094,14 @@ namespace ir {
}
INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
+ INLINE Type getType(void) const { return this->type; }
Tuple src;
Tuple dst;
uint8_t imageIdx;
uint8_t srcNum;
uint8_t dstNum;
+ Type type;
};
class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
@@ -1107,17 +1111,19 @@ namespace ir {
{
public:
- INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
+ INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
this->opcode = OP_MBWRITE;
this->src = srcTuple;
this->srcNum = srcNum;
this->imageIdx = imageIdx;
this->vec_size = vec_size;
+ this->type = type;
}
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
- out << (int)this->getVectorSize()
+ out << "." << type << "."
+ << (int)this->getVectorSize()
<< " 2D surface id " << (int)this->getImageIndex()
<< " byte coord x %" << this->getSrc(fn, 0)
<< " row coord y %" << this->getSrc(fn, 1);
@@ -1128,12 +1134,14 @@ namespace ir {
}
INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
+ INLINE Type getType(void) const { return this->type; }
Tuple src;
Register dst[0];
uint8_t imageIdx;
uint8_t srcNum;
uint8_t vec_size;
+ Type type;
};
#undef ALIGNED_INSTRUCTION
@@ -1349,10 +1357,11 @@ namespace ir {
{
if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
return false;
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+ const RegisterFamily family = getFamily(this->type);
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
return false;
for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID)
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
+ if (UNLIKELY(checkRegisterData(family, getSrc(fn, srcID+1u), fn, whyNot) == false))
return false;
return true;
@@ -1466,7 +1475,8 @@ namespace ir {
SYNC_LOCAL_READ_FENCE |
SYNC_LOCAL_WRITE_FENCE |
SYNC_GLOBAL_READ_FENCE |
- SYNC_GLOBAL_WRITE_FENCE;
+ SYNC_GLOBAL_WRITE_FENCE |
+ SYNC_IMAGE_FENCE;
if (UNLIKELY(this->parameters > maxParams)) {
whyNot = "Invalid parameters for sync instruction";
return false;
@@ -1493,8 +1503,9 @@ namespace ir {
INLINE bool SimdShuffleInstruction::wellFormed(const Function &fn, std::string &whyNot) const
{
- if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT)) {
- whyNot = "Only support S32/U32/FLOAT type";
+ if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT &&
+ this->type != TYPE_U16 && this->type != TYPE_S16)) {
+ whyNot = "Only support S16/U16/S32/U32/FLOAT type";
return false;
}
@@ -1643,12 +1654,8 @@ namespace ir {
whyNot = "Wrong number of source.";
return false;
} else {
- const RegisterFamily fam = fn.getPointerFamily();
- for (uint32_t srcID = 1; srcID < this->srcNum; ++srcID) {
- const Register regID = fn.getRegister(src, srcID);
- if (UNLIKELY(checkRegisterData(fam, regID, fn, whyNot) == false))
- return false;
- }
+ if (UNLIKELY(checkRegisterData(FAMILY_DWORD, fn.getRegister(src, 1), fn, whyNot) == false))
+ return false;
}
break;
default:
@@ -1714,6 +1721,31 @@ namespace ir {
INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
out << "." << AS;
+
+#define OUT_ATOMIC_OP(TYPE) \
+ case ATOMIC_OP_##TYPE: \
+ { out << "." << #TYPE; \
+ break; \
+ }
+ switch(atomicOp)
+ {
+ OUT_ATOMIC_OP(AND)
+ OUT_ATOMIC_OP(OR)
+ OUT_ATOMIC_OP(XOR)
+ OUT_ATOMIC_OP(XCHG)
+ OUT_ATOMIC_OP(INC)
+ OUT_ATOMIC_OP(DEC)
+ OUT_ATOMIC_OP(ADD)
+ OUT_ATOMIC_OP(SUB)
+ OUT_ATOMIC_OP(IMAX)
+ OUT_ATOMIC_OP(IMIN)
+ OUT_ATOMIC_OP(UMAX)
+ OUT_ATOMIC_OP(UMIN)
+ OUT_ATOMIC_OP(CMPXCHG)
+ default:
+ out << "." << "INVALID";
+ assert(0);
+ };
out << " %" << this->getDst(fn, 0);
out << " {" << "%" << this->getSrc(fn, 0) << "}";
for (uint32_t i = 1; i < srcNum; ++i)
@@ -1826,7 +1858,7 @@ namespace ir {
}
static const char *syncStr[syncFieldNum] = {
- "workgroup", "local_read", "local_write", "global_read", "global_write"
+ "workgroup", "local_read", "local_write", "global_read", "global_write", "image"
};
INLINE void SyncInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1885,7 +1917,8 @@ namespace ir {
}
out << " %" << this->getDst(fn, 0);
- out << " %" << this->getSrc(fn, 0);
+ for (uint32_t i = 0; i < this->getSrcNum(); ++i)
+ out << " %" << this->getSrc(fn, i);
if (this->workGroupOp == WORKGROUP_OP_BROADCAST) {
do {
@@ -1910,7 +1943,7 @@ namespace ir {
} while(0);
}
- out << "TheadID Map at SLM: " << this->slmAddr;
+ out << " (TheadID Map at SLM: " << this->slmAddr << ")";
}
INLINE void SubGroupInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1987,6 +2020,7 @@ namespace ir {
case MEM_CONSTANT: return out << "constant";
case MEM_PRIVATE: return out << "private";
case MEM_MIXED: return out << "mixed";
+ case MEM_GENERIC: return out << "generic";
case MEM_INVALID: return out << "invalid";
};
return out;
@@ -2374,8 +2408,10 @@ DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID))
DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex())
DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType())
DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType())
#undef DECL_MEM_FN
@@ -2437,6 +2473,7 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg))
DECL_EMIT_FUNCTION(RNDE)
DECL_EMIT_FUNCTION(RNDU)
DECL_EMIT_FUNCTION(RNDZ)
+ DECL_EMIT_FUNCTION(BFREV)
#undef DECL_EMIT_FUNCTION
@@ -2683,12 +2720,12 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg))
return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert();
}
- Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) {
- return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert();
+ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type) {
+ return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type).convert();
}
- Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
- return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert();
+ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
+ return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type).convert();
}
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index b2b0b49..16c2045 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -62,6 +62,7 @@ namespace ir {
MEM_CONSTANT, //!< Immutable global memory
MEM_PRIVATE, //!< Per thread private memory
MEM_MIXED, //!< mixed address space pointer.
+ MEM_GENERIC, //!< generic address space pointer.
MEM_INVALID
};
@@ -541,17 +542,19 @@ namespace ir {
SYNC_LOCAL_WRITE_FENCE = 1<<2,
SYNC_GLOBAL_READ_FENCE = 1<<3,
SYNC_GLOBAL_WRITE_FENCE = 1<<4,
- SYNC_INVALID = 1<<5
+ SYNC_IMAGE_FENCE = 1<<5,
+ SYNC_INVALID = 1<<6
};
/*! 6 bits to encode all possible synchronization capabilities */
- static const uint32_t syncFieldNum = 5u;
+ static const uint32_t syncFieldNum = 6u;
/*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
/*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
+ static const uint32_t syncImageBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE | SYNC_IMAGE_FENCE;
/*! Sync instructions are used to order loads and stores for a given memory
* space and/or to serialize threads at a given point in the program
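
Image fences get their own bit rather than piggybacking on the global fences, so syncFieldNum grows to 6 and the barrier masks remain simple ORs of per-resource bits. A standalone restatement of the encoding:

    #include <cstdint>

    // Same bit layout as above: the image barrier is the global barrier
    // mask with the new image bit OR'ed in (== 0x39).
    enum : uint32_t {
      WG_EXEC  = 1u << 0, LOCAL_R  = 1u << 1, LOCAL_W = 1u << 2,
      GLOBAL_R = 1u << 3, GLOBAL_W = 1u << 4, IMAGE   = 1u << 5,
    };
    static const uint32_t imageBarrier = WG_EXEC | GLOBAL_R | GLOBAL_W | IMAGE;
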
@@ -642,6 +645,7 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
uint8_t getImageIndex() const;
uint8_t getVectorSize() const;
+ Type getType(void) const;
};
/*! Media Block Write. */
@@ -651,6 +655,7 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
uint8_t getImageIndex() const;
uint8_t getVectorSize() const;
+ Type getType(void) const;
};
/*! Specialize the instruction. Also performs typechecking first based on the
@@ -771,6 +776,8 @@ namespace ir {
Instruction RNDZ(Type type, Register dst, Register src);
/*! bswap.type dst src */
Instruction BSWAP(Type type, Register dst, Register src);
+ /*! bfrev.type dst src */
+ Instruction BFREV(Type type, Register dst, Register src);
/*! pow.type dst src0 src1 */
Instruction POW(Type type, Register dst, Register src0, Register src1);
/*! mul.type dst src0 src1 */
@@ -886,9 +893,9 @@ namespace ir {
/*! printf */
Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num);
/*! media block read */
- Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
+ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type);
/*! media block write */
- Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
+ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type);
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 7d755ae..81618eb 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -116,3 +116,4 @@ DECL_INSN(SUBGROUP, SubGroupInstruction)
DECL_INSN(PRINTF, PrintfInstruction)
DECL_INSN(MBREAD, MediaBlockReadInstruction)
DECL_INSN(MBWRITE, MediaBlockWriteInstruction)
+DECL_INSN(BFREV, UnaryInstruction)
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 654a3bb..93bd96a 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -313,6 +313,7 @@ namespace ir {
Instruction *insn = const_cast<Instruction*>(use->getInstruction());
const Opcode opcode = insn->getOpcode();
const uint32_t dstNum = insn->getDstNum();
+ (void) dstNum;
GBE_ASSERT(dstNum == 1 || opcode == OP_LOAD);
const Register dst = insn->getDst();
auto it = addPtrInsns.find(derivedRegs[i]);
@@ -379,8 +380,14 @@ namespace ir {
const uint32_t offset = valueID * size;
const Register reg = load->getValue(valueID);
-
- Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddressRegister(), offset);
+ Register addressReg = load->getAddressRegister();
+ if (fn->getPointerFamily() == FAMILY_QWORD) {
+ Register tmp = fn->newRegister(FAMILY_DWORD);
+ Instruction cvt = ir::CVT(ir::TYPE_U32, ir::TYPE_U64, tmp, load->getAddressRegister());
+ cvt.insert(ins_after, &ins_after);
+ addressReg = tmp;
+ }
+ Instruction mov = ir::INDIRECT_MOV(type, reg, arg, addressReg, offset);
mov.insert(ins_after, &ins_after);
replaced = true;
}
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index b16319a..212af0d 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -35,6 +35,7 @@ namespace ir {
"group_id_0", "group_id_1", "group_id_2",
"num_groups_0", "num_groups_1", "num_groups_2",
"local_size_0", "local_size_1", "local_size_2",
+ "enqueued_local_size_0", "enqueued_local_size_1", "enqueued_local_size_2",
"global_size_0", "global_size_1", "global_size_2",
"global_offset_0", "global_offset_1", "global_offset_2",
"stack_pointer", "stack_buffer",
@@ -47,7 +48,9 @@ namespace ir {
"profiling_timestamps0", "profiling_timestamps1",
"profiling_timestamps2", "profiling_timestamps3",
"profiling_timestamps4",
- "threadid"
+ "threadid",
+ "constant_addrspace_start",
+ "stack_size", "enqueue_buffer_pointer",
};
#if GBE_DEBUG
@@ -72,13 +75,20 @@ namespace ir {
DECL_NEW_REG(FAMILY_DWORD, lsize0, 1, GBE_CURBE_LOCAL_SIZE_X);
DECL_NEW_REG(FAMILY_DWORD, lsize1, 1, GBE_CURBE_LOCAL_SIZE_Y);
DECL_NEW_REG(FAMILY_DWORD, lsize2, 1, GBE_CURBE_LOCAL_SIZE_Z);
+ DECL_NEW_REG(FAMILY_DWORD, enqlsize0, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_X);
+ DECL_NEW_REG(FAMILY_DWORD, enqlsize1, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y);
+ DECL_NEW_REG(FAMILY_DWORD, enqlsize2, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z);
DECL_NEW_REG(FAMILY_DWORD, gsize0, 1, GBE_CURBE_GLOBAL_SIZE_X);
DECL_NEW_REG(FAMILY_DWORD, gsize1, 1, GBE_CURBE_GLOBAL_SIZE_Y);
DECL_NEW_REG(FAMILY_DWORD, gsize2, 1, GBE_CURBE_GLOBAL_SIZE_Z);
DECL_NEW_REG(FAMILY_DWORD, goffset0, 1, GBE_CURBE_GLOBAL_OFFSET_X);
DECL_NEW_REG(FAMILY_DWORD, goffset1, 1, GBE_CURBE_GLOBAL_OFFSET_Y);
DECL_NEW_REG(FAMILY_DWORD, goffset2, 1, GBE_CURBE_GLOBAL_OFFSET_Z);
- DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+ if(fn.getOclVersion() >= 200) {
+ DECL_NEW_REG(FAMILY_QWORD, stackptr, 0);
+ } else {
+ DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+ }
DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
DECL_NEW_REG(FAMILY_WORD, blockip, 0, GBE_CURBE_BLOCK_IP);
DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
@@ -95,6 +105,9 @@ namespace ir {
DECL_NEW_REG(FAMILY_DWORD, profilingts3, 0, GBE_CURBE_PROFILING_TIMESTAMP3);
DECL_NEW_REG(FAMILY_DWORD, profilingts4, 0, GBE_CURBE_PROFILING_TIMESTAMP4);
DECL_NEW_REG(FAMILY_DWORD, threadid, 1, GBE_CURBE_THREAD_ID);
+ DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE);
+ DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE);
+ DECL_NEW_REG(FAMILY_QWORD, enqueuebufptr, 1, GBE_CURBE_ENQUEUE_BUF_POINTER);
}
#undef DECL_NEW_REG
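
stackptr is the only special register whose family now depends on the unit: OpenCL 2.0 kernels address a 64-bit space, so their stack pointers widen to qword. The gate reduces to the following (hypothetical helper; the real code inlines the test):

    #include "ir/register.hpp"

    static gbe::ir::RegisterFamily stackPtrFamily(uint32_t oclVersion) {
      using namespace gbe::ir;
      return oclVersion >= 200 ? FAMILY_QWORD : FAMILY_DWORD;
    }
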
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index eab7892..ebd5142 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -53,30 +53,36 @@ namespace ir {
static const Register lsize0 = Register(9); // get_local_size(0)
static const Register lsize1 = Register(10); // get_local_size(1)
static const Register lsize2 = Register(11); // get_local_size(2)
- static const Register gsize0 = Register(12); // get_global_size(0)
- static const Register gsize1 = Register(13); // get_global_size(1)
- static const Register gsize2 = Register(14); // get_global_size(2)
- static const Register goffset0 = Register(15); // get_global_offset(0)
- static const Register goffset1 = Register(16); // get_global_offset(1)
- static const Register goffset2 = Register(17); // get_global_offset(2)
- static const Register stackptr = Register(18); // stack pointer
- static const Register stackbuffer = Register(19); // stack buffer base address.
- static const Register blockip = Register(20); // blockip
- static const Register barrierid = Register(21);// barrierid
- static const Register threadn = Register(22); // number of threads
- static const Register workdim = Register(23); // work dimention.
- static const Register zero = Register(24); // scalar register holds zero.
- static const Register one = Register(25); // scalar register holds one.
- static const Register retVal = Register(26); // helper register to do data flow analysis.
- static const Register dwblockip = Register(27); // blockip
- static const Register profilingbptr = Register(28); // buffer addr for profiling.
- static const Register profilingts0 = Register(29); // timestamp for profiling.
- static const Register profilingts1 = Register(30); // timestamp for profiling.
- static const Register profilingts2 = Register(31); // timestamp for profiling.
- static const Register profilingts3 = Register(32); // timestamp for profiling.
- static const Register profilingts4 = Register(33); // timestamp for profiling.
- static const Register threadid = Register(34); // the thread id of this thread.
- static const uint32_t regNum = 35; // number of special registers
+ static const Register enqlsize0 = Register(12); // get_enqueued_local_size(0)
+ static const Register enqlsize1 = Register(13); // get_enqueued_local_size(1)
+ static const Register enqlsize2 = Register(14); // get_enqueued_local_size(2)
+ static const Register gsize0 = Register(15); // get_global_size(0)
+ static const Register gsize1 = Register(16); // get_global_size(1)
+ static const Register gsize2 = Register(17); // get_global_size(2)
+ static const Register goffset0 = Register(18); // get_global_offset(0)
+ static const Register goffset1 = Register(19); // get_global_offset(1)
+ static const Register goffset2 = Register(20); // get_global_offset(2)
+ static const Register stackptr = Register(21); // stack pointer
+ static const Register stackbuffer = Register(22); // stack buffer base address.
+ static const Register blockip = Register(23); // blockip
+ static const Register barrierid = Register(24);// barrierid
+ static const Register threadn = Register(25); // number of threads
+ static const Register workdim = Register(26); // work dimension.
+ static const Register zero = Register(27); // scalar register holds zero.
+ static const Register one = Register(28); // scalar register holds one.
+ static const Register retVal = Register(29); // helper register to do data flow analysis.
+ static const Register dwblockip = Register(30); // blockip
+ static const Register profilingbptr = Register(31); // buffer addr for profiling.
+ static const Register profilingts0 = Register(32); // timestamp for profiling.
+ static const Register profilingts1 = Register(33); // timestamp for profiling.
+ static const Register profilingts2 = Register(34); // timestamp for profiling.
+ static const Register profilingts3 = Register(35); // timestamp for profiling.
+ static const Register profilingts4 = Register(36); // timestamp for profiling.
+ static const Register threadid = Register(37); // the thread id of this thread.
+ static const Register constant_addrspace = Register(38); // starting address of program-scope constants
+ static const Register stacksize = Register(39); // stack buffer total size
+ static const Register enqueuebufptr = Register(40); // enqueue buffer address.
+ static const uint32_t regNum = 41; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/ir/profiling.cpp b/backend/src/ir/profiling.cpp
index 09537fa..ac61e9b 100644
--- a/backend/src/ir/profiling.cpp
+++ b/backend/src/ir/profiling.cpp
@@ -58,7 +58,7 @@ namespace ir
proLog = ((proLog << 32) & 0xffffffff00000000) + log->timestampPrologLo;
uint64_t epiLog = log->timestampEpilogHi;
epiLog = ((epiLog << 32) & 0xffffffff00000000) + log->timestampEpilogLo;
- printf(" | dispatch Mask:%4x prolog:%10lu epilog:%10lu |\n", log->dispatchMask, proLog, epiLog);
+ printf(" | dispatch Mask:%4x prolog:%10" PRIu64 " epilog:%10" PRIu64 " |\n", log->dispatchMask, proLog, epiLog);
printf(" | globalX:%4d~%4d globalY:%4d~%4d globalZ:%4d~%4d |\n", log->gidXStart, log->gidXEnd,
log->gidYStart, log->gidYEnd, log->gidZStart, log->gidZEnd);
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 8200c31..1e78722 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -35,6 +35,9 @@ namespace ir {
case FAMILY_WORD: return out << "word";
case FAMILY_DWORD: return out << "dword";
case FAMILY_QWORD: return out << "qword";
+ case FAMILY_OWORD: return out << "oword";
+ case FAMILY_HWORD: return out << "hword";
+ case FAMILY_REG: return out << "reg";
};
return out;
}
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index 11ab756..09af24e 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -45,11 +45,14 @@ namespace ir {
FAMILY_BYTE = 1,
FAMILY_WORD = 2,
FAMILY_DWORD = 3,
- FAMILY_QWORD = 4
+ FAMILY_QWORD = 4,
+ FAMILY_OWORD = 5,
+ FAMILY_HWORD = 6,
+ FAMILY_REG = 7
};
INLINE char getFamilyName(RegisterFamily family) {
- static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q'};
+ static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q', 'O', 'H', 'R'};
return registerFamilyName[family];
}
@@ -59,6 +62,7 @@ namespace ir {
case FAMILY_WORD: return 2;
case FAMILY_DWORD: return 4;
case FAMILY_QWORD: return 8;
+ case FAMILY_REG: return 32;
default: NOT_SUPPORTED;
};
return 0;
diff --git a/backend/src/ir/reloc.cpp b/backend/src/ir/reloc.cpp
new file mode 100644
index 0000000..4884610
--- /dev/null
+++ b/backend/src/ir/reloc.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file reloc.cpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "reloc.hpp"
+
+namespace gbe {
+namespace ir {
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ /*! Implements the serialization. */
+ uint32_t RelocTable::serializeToBin(std::ostream& outs) {
+ uint32_t ret_size = 0;
+ uint32_t sz = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+
+ sz = getCount();
+ OUT_UPDATE_SZ(sz);
+ RelocEntry entry(0, 0);
+ for (uint32_t i = 0; i < sz; ++i) {
+ entry = entries[i];
+ OUT_UPDATE_SZ(entry.refOffset);
+ OUT_UPDATE_SZ(entry.defOffset);
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ OUT_UPDATE_SZ(ret_size);
+
+ return ret_size;
+ }
+
+ uint32_t RelocTable::deserializeFromBin(std::istream& ins) {
+ uint32_t total_size = 0;
+ uint32_t magic;
+ uint32_t refOffset;
+ uint32_t defOffset;
+ uint32_t sz = 0;
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(sz); // entry count
+ for (uint32_t i = 0; i < sz; i++) {
+ IN_UPDATE_SZ(refOffset);
+ IN_UPDATE_SZ(defOffset);
+ addEntry(refOffset, defOffset);
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+
+ uint32_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/reloc.hpp b/backend/src/ir/reloc.hpp
new file mode 100644
index 0000000..de33a8a
--- /dev/null
+++ b/backend/src/ir/reloc.hpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file reloc.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_RELOC_HPP__
+#define __GBE_IR_RELOC_HPP__
+
+#include "sys/vector.hpp"
+#include <string.h>
+
+namespace gbe {
+namespace ir {
+
+
+ /*! A single relocation: refOffset is where a pointer lives in the global
+ * constant data, defOffset is the location that pointer must refer to.
+ */
+ struct RelocEntry {
+ RelocEntry(unsigned int rO, unsigned int dO):
+ refOffset(rO),
+ defOffset(dO) {}
+
+ unsigned int refOffset;
+ unsigned int defOffset;
+ };
+
+ class RelocTable : public NonCopyable, public Serializable
+ {
+ public:
+ void addEntry(unsigned refOffset, unsigned defOffset) {
+ entries.push_back(RelocEntry(refOffset, defOffset));
+ }
+ RelocTable() : Serializable() {}
+ RelocTable(const RelocTable& other) : Serializable(other),
+ entries(other.entries) {}
+ uint32_t getCount() { return entries.size(); }
+ void getData(char *p) {
+ if (entries.size() > 0 && p)
+ memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry));
+ }
+ static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C');
+ static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R');
+
+ /* format:
+ magic_begin |
+ reloc_table_size |
+ entry_0_refOffset |
+ entry_0_defOffset |
+ entry_1_refOffset |
+ entry_1_defOffset |
+ ........ |
+ entry_n_refOffset |
+ entry_n_defOffset |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual uint32_t serializeToBin(std::ostream& outs);
+ virtual uint32_t deserializeFromBin(std::istream& ins);
+ private:
+ vector<RelocEntry> entries;
+ GBE_CLASS(RelocTable);
+ };
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_RELOC_HPP__ */
+
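
A round-trip over the layout documented in the header, using only the API declared above (assumes the gbe serialization headers are on the include path):

    #include <sstream>
    #include "ir/reloc.hpp"

    static bool relocRoundTrip() {
      gbe::ir::RelocTable t;
      t.addEntry(/*refOffset*/ 0x40, /*defOffset*/ 0x100);
      std::stringstream ss;
      t.serializeToBin(ss);            // magic | count | pairs | magic | size
      gbe::ir::RelocTable u;
      return u.deserializeFromBin(ss) != 0 && u.getCount() == 1;
    }
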
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
index d528859..3ac758f 100644
--- a/backend/src/ir/type.hpp
+++ b/backend/src/ir/type.hpp
@@ -86,8 +86,8 @@ namespace ir {
case FAMILY_WORD: return TYPE_U16;
case FAMILY_DWORD: return TYPE_U32;
case FAMILY_QWORD: return TYPE_U64;
- };
- return TYPE_U32;
+ default: return TYPE_U32;
+ }
}
} /* namespace ir */
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index c9cb15e..79e129d 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -30,6 +30,7 @@ namespace ir {
Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {
profilingInfo = GBE_NEW(ProfilingInfo);
inProfilingMode = false;
+ oclVersion = 120;
}
Unit::~Unit(void) {
for (const auto &pair : functions) GBE_DELETE(pair.second);
@@ -50,12 +51,11 @@ namespace ir {
functions[name] = fn;
return fn;
}
- void Unit::newConstant(const char *data,
- const std::string &name,
+ void Unit::newConstant(const std::string &name,
uint32_t size,
uint32_t alignment)
{
- constantSet.append(data, name, size, alignment);
+ constantSet.append(name, size, alignment);
}
std::ostream &operator<< (std::ostream &out, const Unit &unit) {
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 10a1af6..46d7be7 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -28,7 +28,9 @@
#include "ir/register.hpp"
#include "ir/profiling.hpp"
#include "ir/printf.hpp"
+#include "ir/reloc.hpp"
#include "sys/map.hpp"
+#include <string.h>
#include "llvm/IR/Instructions.h"
@@ -39,15 +41,13 @@ namespace ir {
class Function;
class ProfilingInfo;
- /*! Complete unit of compilation. It contains a set of functions and a set of
- * constant the functions may refer to.
- */
class Unit : public NonCopyable
{
public:
typedef map<std::string, Function*> FunctionSet;
/*! Moved from printf pass */
map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
+ vector<std::string> blockFuncs;
/*! Create an empty unit */
Unit(PointerSize pointerSize = POINTER_32_BITS);
/*! Release everything (*including* the function pointers) */
@@ -59,7 +59,7 @@ namespace ir {
/*! Return NULL if the function already exists */
Function *newFunction(const std::string &name);
/*! Create a new constant in the constant set */
- void newConstant(const char*, const std::string&, uint32_t size, uint32_t alignment);
+ void newConstant(const std::string&, uint32_t size, uint32_t alignment);
/*! Apply the given functor on all the functions */
template <typename T>
INLINE void apply(const T &functor) const {
@@ -68,6 +68,7 @@ namespace ir {
}
/*! Return the size of the pointers manipulated */
INLINE PointerSize getPointerSize(void) const { return pointerSize; }
+ INLINE void setPointerSize(PointerSize size) { pointerSize = size; }
/*! Return the family of registers that contain pointer */
INLINE RegisterFamily getPointerFamily(void) const {
if (this->getPointerSize() == POINTER_32_BITS)
@@ -77,6 +78,8 @@ namespace ir {
}
/*! Return the constant set */
ConstantSet& getConstantSet(void) { return constantSet; }
+ const RelocTable& getRelocTable(void) const { return relocTable; }
+ RelocTable& getRelocTable(void) { return relocTable; }
/*! Return the constant set */
const ConstantSet& getConstantSet(void) const { return constantSet; }
/*! Get profiling info in this function */
@@ -87,13 +90,17 @@ namespace ir {
bool getInProfilingMode(void) const { return inProfilingMode; }
void setValid(bool value) { valid = value; }
bool getValid() { return valid; }
+ void setOclVersion(uint32_t version) { oclVersion = version; }
+ uint32_t getOclVersion() const { return oclVersion; }
private:
friend class ContextInterface; //!< Can free modify the unit
FunctionSet functions; //!< All the defined functions
ConstantSet constantSet; //!< All the constants defined in the unit
+ RelocTable relocTable;
PointerSize pointerSize; //!< Size shared by all pointers
ProfilingInfo *profilingInfo; //!< profilingInfo store the information for profiling.
GBE_CLASS(Unit);
+ uint32_t oclVersion;
bool valid;
bool inProfilingMode;
};
diff --git a/backend/src/libocl/Android.mk b/backend/src/libocl/Android.mk
index 8e45c12..08044af 100644
--- a/backend/src/libocl/Android.mk
+++ b/backend/src/libocl/Android.mk
@@ -86,4 +86,3 @@ $(shell $(HOST_OUT)/bin/llvm-link -o ${generated_sources}/../beignet.bc $(addpre
$(shell $(HOST_OUT)/bin/clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${generated_sources}/include/ --relocatable-pch -emit-pch -isysroot ${generated_sources} -x cl ${generated_sources}/include/ocl.h -o ${generated_sources}/../beignet.pch)
-
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 1d1ec68..c68ecb0 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -2,6 +2,8 @@ PROJECT(LIBOCL)
SET (OCL_OBJECT_DIR ${LIBOCL_BINARY_DIR}/${BEIGNET_INSTALL_DIR})
SET (OCL_HEADER_FILES ${OCL_OBJECT_DIR}/include/ocl_defines.h)
SET (OCL_SOURCE_FILES "")
+SET (OCL_SOURCE_FILES_12 "")
+SET (OCL_SOURCE_FILES_20 "")
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/include/ocl_defines.h
COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
@@ -30,11 +32,11 @@ MACRO(COPY_THE_HEADER _mod)
)
ENDIF(orgin_name STREQUAL output_name)
ENDMACRO(COPY_THE_HEADER)
-MACRO(COPY_THE_SOURCE _mod)
+MACRO(COPY_THE_SOURCE _source _mod)
# Use the python script to generate the header files.
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/src/\\1.cl" orgin_name ${_mod})
- SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name})
+ SET(${_source} ${${_source}} ${output_name})
IF(orgin_name STREQUAL output_name)
ELSE(orgin_name STREQUAL output_name)
ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
@@ -50,14 +52,27 @@ ENDMACRO(COPY_THE_SOURCE)
SET (OCL_COPY_HEADERS ocl ocl_types ocl_float ocl_printf)
FOREACH(M ${OCL_COPY_HEADERS})
COPY_THE_HEADER(${M})
-ENDFOREACH(M)
+ENDFOREACH(M)
+
+SET (OCL_COPY_MODULES ocl_workitem ocl_async ocl_sync ocl_memcpy
+ ocl_memset ocl_misc ocl_geometric ocl_image ocl_work_group)
-SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy
- ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group)
FOREACH(M ${OCL_COPY_MODULES})
COPY_THE_HEADER(${M})
- COPY_THE_SOURCE(${M})
-ENDFOREACH(M)
+ COPY_THE_SOURCE(OCL_SOURCE_FILES ${M})
+ENDFOREACH(M)
+
+SET (OCL_COPY_MODULES_12 ocl_vload ocl_atom)
+FOREACH(M ${OCL_COPY_MODULES_12})
+ COPY_THE_HEADER(${M})
+ COPY_THE_SOURCE(OCL_SOURCE_FILES_12 ${M})
+ENDFOREACH(M)
+
+SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe ocl_enqueue)
+FOREACH(M ${OCL_COPY_MODULES_20})
+ COPY_THE_HEADER(${M})
+ COPY_THE_SOURCE(OCL_SOURCE_FILES_20 ${M})
+ENDFOREACH(M)
MACRO(GENERATE_HEADER_PY _mod)
@@ -77,11 +92,11 @@ MACRO(GENERATE_HEADER_PY _mod)
COMMENT "Generate the header by python: ${output_name}"
)
ENDMACRO(GENERATE_HEADER_PY)
-MACRO(GENERATE_SOURCE_PY _mod)
+MACRO(GENERATE_SOURCE_PY _source _mod)
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/tmpl/\\1.tmpl.cl" tmpl_name ${_mod})
STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/script/\\1.def" def_name ${_mod})
- SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name})
+ SET(${_source} ${${_source}} ${output_name})
ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
COMMAND cat ${tmpl_name} > ${output_name}
@@ -91,12 +106,24 @@ MACRO(GENERATE_SOURCE_PY _mod)
)
ENDMACRO(GENERATE_SOURCE_PY)
-SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math ocl_simd)
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_simd)
FOREACH(M ${OCL_PY_GENERATED_MODULES})
GENERATE_HEADER_PY(${M})
- GENERATE_SOURCE_PY(${M})
+ GENERATE_SOURCE_PY(OCL_SOURCE_FILES ${M})
ENDFOREACH(M)
+SET (OCL_PY_GENERATED_MODULES_12 ocl_math)
+FOREACH(M ${OCL_PY_GENERATED_MODULES_12})
+ GENERATE_HEADER_PY(${M})
+ GENERATE_SOURCE_PY(OCL_SOURCE_FILES_12 ${M})
+ENDFOREACH(M)
+
+SET (OCL_PY_GENERATED_MODULES_20 ocl_math_20)
+FOREACH(M ${OCL_PY_GENERATED_MODULES_20})
+ GENERATE_HEADER_PY(${M})
+ GENERATE_SOURCE_PY(OCL_SOURCE_FILES_20 ${M})
+ENDFOREACH(M)
+
MACRO(GENERATE_HEADER_BASH _mod)
# Use the python script to generate the header files.
@@ -129,15 +156,15 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
GENERATE_SOURCE_BASH(${M})
ENDFOREACH(M)
+SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -triple spir -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2" -D__OPENCL_C_VERSION__=120)
+SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 -cl-kernel-arg-info -fblocks -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL2.0" -D__OPENCL_C_VERSION__=200)
-SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2")
-MACRO(ADD_CL_TO_BC_TARGET _file)
+MACRO(ADD_CL_TO_BC_TARGET _file _output _clang_flag)
# CMake seems can not add pattern rule, use MACRO to replace.
- STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" output_name ${_file})
- ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+ ADD_CUSTOM_COMMAND(OUTPUT ${_output}
COMMAND mkdir -p ${OCL_OBJECT_DIR}/
- #COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
- COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
+ #COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -o ${output_name} -x cl ${_file}
+ COMMAND ${CLANG_EXECUTABLE} -cc1 ${_clang_flag} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -o ${_output} -x cl ${_file}
DEPENDS ${_file} ${OCL_HEADER_FILES}
COMMENT "Compiling ${_file}"
)
@@ -145,14 +172,16 @@ ENDMACRO(ADD_CL_TO_BC_TARGET)
FOREACH(f ${OCL_SOURCE_FILES})
- ADD_CL_TO_BC_TARGET(${f})
-ENDFOREACH(f)
-
-FOREACH(f ${OCL_SOURCE_FILES})
STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
- SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
-ENDFOREACH(f)
+ SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
+ ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS}")
+ENDFOREACH(f)
+FOREACH(f ${OCL_SOURCE_FILES_12})
+ STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+ SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
+ ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS}")
+ENDFOREACH(f)
# handle the ll files
MACRO(COPY_THE_LL _mod)
@@ -178,42 +207,90 @@ MACRO(ADD_LL_TO_BC_TARGET M)
#COMMAND echo ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
COMMAND ${LLVM_AS_EXECUTABLE} -o ${output_name} ${srcll_name}
DEPENDS ${srcll_name}
- COMMENT "Compiling ${srcll_name}"
+ COMMENT "Compiling ${output_name}"
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
-FOREACH(f ${OCL_LL_MODULES})
+SET (OCL_LL_MODULES_12 ocl_barrier ocl_clz ocl_ctz)
+FOREACH(f ${OCL_LL_MODULES_12})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
- SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
-ENDFOREACH(f)
-
+ SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
+ENDFOREACH(f)
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.bc
COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
- #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES}
- COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
- DEPENDS ${OCL_BC_FILES}
+ #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES_12}
+ COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES_12}
+ DEPENDS ${OCL_BC_FILES_12}
COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc"
)
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.local.pch
COMMAND mkdir -p ${OCL_OBJECT_DIR}
- COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
+ COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
DEPENDS ${OCL_HEADER_FILES}
COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.local.pch"
)
ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.pch
COMMAND mkdir -p ${OCL_OBJECT_DIR}
- COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
+ COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
DEPENDS ${OCL_HEADER_FILES}
COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.pch"
)
+if (ENABLE_OPENCL_20)
+ FOREACH(f ${OCL_SOURCE_FILES})
+ STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1_20.bc" bc_name ${f})
+ SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+ ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS_20}")
+ ENDFOREACH(f)
+
+ FOREACH(f ${OCL_SOURCE_FILES_20})
+ STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+ SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+ ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS_20}")
+ ENDFOREACH(f)
+
+ SET (OCL_LL_MODULES_20 ocl_barrier_20 ocl_clz_20 ocl_ctz_20 ocl_atomic_20)
+ FOREACH(f ${OCL_LL_MODULES_20})
+ COPY_THE_LL(${f})
+ ADD_LL_TO_BC_TARGET(${f})
+ STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+ SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+ ENDFOREACH(f)
+
+ ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet_20.bc
+ COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
+ #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet_20.bc ${OCL_BC_FILES_20}
+ COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet_20.bc ${OCL_BC_FILES_20}
+ DEPENDS ${OCL_BC_FILES_20}
+ COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet_20.bc"
+ )
+
+ ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet_20.local.pch
+ COMMAND mkdir -p ${OCL_OBJECT_DIR}
+ COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS_20} -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet_20.local.pch
+ DEPENDS ${OCL_HEADER_FILES}
+ COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet_20.local.pch"
+ )
+
+ ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet_20.pch
+ COMMAND mkdir -p ${OCL_OBJECT_DIR}
+ COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS_20} -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet_20.pch
+ DEPENDS ${OCL_HEADER_FILES}
+ COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet_20.pch"
+ )
+endif (ENABLE_OPENCL_20)
+
+
+if (ENABLE_OPENCL_20)
+add_custom_target(beignet_bitcode ALL DEPENDS ${OCL_OBJECT_DIR}/beignet.bc ${OCL_OBJECT_DIR}/beignet_20.bc ${OCL_OBJECT_DIR}/beignet.pch ${OCL_OBJECT_DIR}/beignet_20.pch ${OCL_OBJECT_DIR}/beignet.local.pch ${OCL_OBJECT_DIR}/beignet_20.local.pch)
+else(ENABLE_OPENCL_20)
add_custom_target(beignet_bitcode ALL DEPENDS ${OCL_OBJECT_DIR}/beignet.bc ${OCL_OBJECT_DIR}/beignet.pch ${OCL_OBJECT_DIR}/beignet.local.pch)
+endif (ENABLE_OPENCL_20)
SET (OCL_OBJECT_DIR ${OCL_OBJECT_DIR} PARENT_SCOPE)
SET (OCL_HEADER_FILES ${OCL_HEADER_FILES} PARENT_SCOPE)
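
With the variables expanded, the per-file custom command above amounts to a direct clang -cc1 invocation; for the 2.0 flavor it looks roughly like this (directories are illustrative, the flags are the CLANG_OCL_FLAGS_20 set defined earlier):

    clang -cc1 -fno-builtin -ffp-contract=off -triple spir64 -cl-kernel-arg-info \
          -fblocks -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND -cl-std=CL2.0 \
          -D__OPENCL_C_VERSION__=200 -I <OCL_OBJECT_DIR>/include/ \
          -emit-llvm-bc -o <OCL_OBJECT_DIR>/ocl_atom_20.bc -x cl ocl_atom_20.cl

llvm-link then merges the per-module .bc files into beignet.bc / beignet_20.bc, and the pch targets precompile ocl.h once per language version.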
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 5e3a788..2548cb7 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -83,21 +83,29 @@
#include "ocl_types.h"
#include "ocl_as.h"
#include "ocl_async.h"
-#include "ocl_atom.h"
#include "ocl_common.h"
#include "ocl_convert.h"
#include "ocl_float.h"
#include "ocl_geometric.h"
#include "ocl_image.h"
#include "ocl_integer.h"
-#include "ocl_math.h"
#include "ocl_memcpy.h"
#include "ocl_memset.h"
#include "ocl_misc.h"
#include "ocl_printf.h"
#include "ocl_relational.h"
#include "ocl_sync.h"
+#if (__OPENCL_C_VERSION__ >= 200)
+#include "ocl_vload_20.h"
+#include "ocl_atom_20.h"
+#include "ocl_pipe.h"
+#include "ocl_math_20.h"
+#include "ocl_enqueue.h"
+#else
#include "ocl_vload.h"
+#include "ocl_atom.h"
+#include "ocl_math.h"
+#endif
#include "ocl_workitem.h"
#include "ocl_simd.h"
#include "ocl_work_group.h"
@@ -114,6 +122,7 @@
#define cl_khr_fp16
#define cl_khr_3d_image_writes
#define cl_intel_subgroups
+#define cl_intel_subgroups_short
#pragma OPENCL EXTENSION cl_khr_fp64 : disable
#pragma OPENCL EXTENSION cl_khr_fp16 : disable
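
Note that the 1.2/2.0 split in ocl.h is driven entirely by __OPENCL_C_VERSION__, which the CMake rules define explicitly (-D__OPENCL_C_VERSION__=120 or =200) to match -cl-std. User kernels can gate on the same macro; a minimal sketch:

    kernel void bump(global int *p) {
    #if (__OPENCL_C_VERSION__ >= 200)
      atomic_fetch_add((global atomic_int *)p, 1);   /* 2.0 atomics */
    #else
      atomic_add(p, 1);                              /* 1.2 atomics */
    #endif
    }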
diff --git a/backend/src/libocl/include/ocl_atom_20.h b/backend/src/libocl/include/ocl_atom_20.h
new file mode 100644
index 0000000..9e34c31
--- /dev/null
+++ b/backend/src/libocl/include/ocl_atom_20.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_ATOM20_H__
+#define __OCL_ATOM20_H__
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+
+OVERLOADABLE uint atomic_add(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_add(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_add(volatile __global int *p, int val);
+OVERLOADABLE int atomic_add(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_sub(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_sub(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_sub(volatile __global int *p, int val);
+OVERLOADABLE int atomic_sub(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_and(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_and(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_and(volatile __global int *p, int val);
+OVERLOADABLE int atomic_and(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_or(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_or(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_or(volatile __global int *p, int val);
+OVERLOADABLE int atomic_or(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_xor(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xor(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xor(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xor(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_xchg(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xchg(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xchg(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xchg(volatile __local int *p, int val);
+
+OVERLOADABLE int atomic_min(volatile __global int *p, int val);
+OVERLOADABLE int atomic_min(volatile __local int *p, int val);
+
+OVERLOADABLE int atomic_max(volatile __global int *p, int val);
+OVERLOADABLE int atomic_max(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_min(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_min(volatile __local uint *p, uint val);
+
+OVERLOADABLE uint atomic_max(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_max(volatile __local uint *p, uint val);
+
+OVERLOADABLE float atomic_xchg (volatile __global float *p, float val);
+OVERLOADABLE float atomic_xchg (volatile __local float *p, float val);
+
+OVERLOADABLE uint atomic_inc (volatile __global uint *p);
+OVERLOADABLE uint atomic_inc (volatile __local uint *p);
+OVERLOADABLE int atomic_inc (volatile __global int *p);
+OVERLOADABLE int atomic_inc (volatile __local int *p);
+
+OVERLOADABLE uint atomic_dec (volatile __global uint *p);
+OVERLOADABLE uint atomic_dec (volatile __local uint *p);
+OVERLOADABLE int atomic_dec (volatile __global int *p);
+OVERLOADABLE int atomic_dec (volatile __local int *p);
+
+OVERLOADABLE uint atomic_cmpxchg (volatile __global uint *p, uint cmp, uint val);
+OVERLOADABLE uint atomic_cmpxchg (volatile __local uint *p, uint cmp, uint val);
+OVERLOADABLE int atomic_cmpxchg (volatile __global int *p, int cmp, int val);
+OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val);
+
+
+// XXX kept for the conformance tests.
+// The following atom_xxx APIs come from the OpenCL 1.0 spec.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
+//OpenCL 2.0 features
+#define ATOMIC_GEN_FUNCTIONS(ATYPE, CTYPE, POSTFIX) \
+CTYPE __gen_ocl_atomic_exchange##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_add##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_sub##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_or##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_xor##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_and##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope);\
+CTYPE __gen_ocl_atomic_compare_exchange_strong##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int success, int failure, int scope); \
+CTYPE __gen_ocl_atomic_compare_exchange_weak##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int success, int failure, int scope);
+
+ATOMIC_GEN_FUNCTIONS(atomic_int, int, 32)
+#ifndef DISABLE_ATOMIC_INT64
+ATOMIC_GEN_FUNCTIONS(atomic_long, long, 64)
+#endif
+float __gen_ocl_atomic_exchangef(volatile atomic_int *p, float val, int order, int scope);
+float __gen_ocl_atomic_fetch_addf(volatile atomic_int *p, float val, int order, int scope);
+
+#undef ATOMIC_GEN_FUNCTIONS
+
+/* ATOMIC_VAR_INIT is only used to initialize atomics in the global address space */
+//#define ATOMIC_VAR_INIT(C value)
+#define ATOMIC_VAR_INIT
+#define ATOMIC_FLAG_INIT 0
+
+// C11-style atomic built-ins: init/load/store/exchange/compare-exchange/fetch ops
+#define ATOMIC_FUNCTIONS(ATYPE, CTYPE, MTYPE1, MTYPE2) \
+OVERLOADABLE void atomic_init(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE void atomic_store(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_load(volatile ATYPE *object); \
+OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order); \
+OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_exchange(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \
+OVERLOADABLE bool atomic_compare_exchange_strong(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \
+OVERLOADABLE bool atomic_compare_exchange_weak(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_add(volatile ATYPE *object, MTYPE1 desired); \
+OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_sub(volatile ATYPE *object, MTYPE1 desired); \
+OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_or(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_xor(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_and(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_min(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_max(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);
+
+ATOMIC_FUNCTIONS(atomic_int, int, int, int)
+ATOMIC_FUNCTIONS(atomic_uint, uint, uint, uint)
+#ifndef DISABLE_ATOMIC_INT64
+ATOMIC_FUNCTIONS(atomic_long, long, long, long)
+ATOMIC_FUNCTIONS(atomic_ulong, ulong, ulong, ulong)
+#endif
+ATOMIC_FUNCTIONS(atomic_float, float, float, float)
+#undef ATOMIC_FUNCTIONS
+
+
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
+#endif /* __OCL_ATOM20_H__ */
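
A minimal usage sketch for the C11-style entry points declared above (the kernel is illustrative, not part of the library); relaxed ordering is sufficient for a plain counter:

    kernel void count_keys(global atomic_uint *bins, global const uint *keys) {
      size_t gid = get_global_id(0);
      atomic_fetch_add_explicit(&bins[keys[gid]], 1u,
                                memory_order_relaxed, memory_scope_device);
    }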
diff --git a/backend/src/libocl/include/ocl_enqueue.h b/backend/src/libocl/include/ocl_enqueue.h
new file mode 100644
index 0000000..6479df7
--- /dev/null
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __OCL_ENQUEUE_H__
+#define __OCL_ENQUEUE_H__
+
+#include "ocl_types.h"
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0
+#define CLK_ENQUEUE_FLAGS_NO_WAIT 1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 2
+#define CLK_SUCCESS 0
+#define CL_COMPLETE 0
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0
+
+struct ndrange_info_t {
+ int type;
+ int global_work_size[3];
+ int local_work_size[3];
+ int global_work_offset[3];
+};
+
+struct Block_literal {
+ void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+ int flags;
+ int reserved;
+ __global void (*invoke)(void *, ...);
+ struct Block_descriptor_1 {
+ unsigned long int reserved; // NULL
+ unsigned long int size; // sizeof(struct Block_literal_1)
+ // optional helper functions
+ void (*copy_helper)(void *dst, void *src); // IFF (1<<25)
+ void (*dispose_helper)(void *src); // IFF (1<<25)
+ // required ABI.2010.3.16
+ const char *signature; // IFF (1<<30)
+ } *descriptor;
+ // imported variables
+};
+
+clk_event_t create_user_event(void);
+void retain_event(clk_event_t event);
+void release_event(clk_event_t event);
+void set_user_event_status(clk_event_t event, int status);
+bool is_valid_event(clk_event_t event);
+void capture_event_profiling_info(clk_event_t event, int name, global void *value);
+
+uint __get_kernel_work_group_size_impl(__private void *block);
+uint __get_kernel_preferred_work_group_multiple_impl(__private void *block);
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+ uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+ clk_event_t *event_ret, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, __private void *block, uint size0, ...);
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+ uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+ clk_event_t *event_ret, __private void *block, uint size0, ...);
+
+queue_t get_default_queue(void);
+int __gen_enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void), int size);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes);
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size);
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2]);
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3]);
+
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret);
+#endif
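
For orientation, a device-side enqueue through the simplest overload above might look like the following (illustrative; the compiler presumably lowers the block form onto __gen_enqueue_kernel):

    kernel void parent(global int *buf) {
      queue_t q = get_default_queue();
      ndrange_t nd = ndrange_1D(64);
      enqueue_kernel(q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, nd,
                     ^{ buf[get_global_id(0)] += 1; });
    }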
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
index cdb3411..5a679aa 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -20,28 +20,77 @@
#include "ocl_types.h"
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagei(write_only image1d_t cl_image, int coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image1d_t cl_image, float coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imageui(write_only image1d_t cl_image, int coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagef(write_only image1d_t cl_image, int coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image1d_t cl_image, float coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagei(write_only image1d_buffer_t cl_image, int coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, int coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagef(write_only image1d_buffer_t cl_image, int coord, float4 color);
+#define int1 int
+#define float1 float
+
+#define DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, DATA_TYPE, SUFFIX, N) \
+ OVERLOADABLE DATA_TYPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, int##N coord); \
+ OVERLOADABLE DATA_TYPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, float##N coord);
+
+#define DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, DATA_TYPE, SUFFIX, N) \
+ OVERLOADABLE DATA_TYPE read_image ## SUFFIX(IMG_TYPE cl_image, int##N coord);
+
+#define DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, DATA_TYPE, SUFFIX, N) \
+ OVERLOADABLE void write_image ## SUFFIX(IMG_TYPE cl_image, int##N coord, DATA_TYPE color);
+
+#define DECL_IMAGE_TYPE_READ_NO_SAMPLE(IMG_TYPE, N)\
+ DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, int4, i, N) \
+ DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, uint4, ui, N) \
+ DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, float4, f, N)
+
+#define DECL_IMAGE_TYPE_READ_SAMPLE(IMG_TYPE, N)\
+ DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, int4, i, N) \
+ DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, uint4, ui, N) \
+ DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, float4, f, N)
+
+#define DECL_IMAGE_TYPE_WRITE(IMG_TYPE, N)\
+ DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, int4, i, N) \
+ DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, uint4, ui, N) \
+ DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, float4, f, N)
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE(IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_write IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_SAMPLE(read_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_WRITE(read_write IMG_TYPE, N)
+#else
+#define DECL_IMAGE(IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_SAMPLE(read_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N)
+#endif
+
+DECL_IMAGE(image1d_t, 1)
+DECL_IMAGE(image2d_t, 2)
+DECL_IMAGE(image1d_array_t, 2)
+DECL_IMAGE(image3d_t, 3)
+DECL_IMAGE(image3d_t, 4)
+DECL_IMAGE(image2d_array_t, 3)
+DECL_IMAGE(image2d_array_t, 4)
+
+#undef DECL_IMAGE
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE(IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_write IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_WRITE(read_write IMG_TYPE, N)
+#else
+#define DECL_IMAGE(IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+ DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N)
+#endif
+
+DECL_IMAGE(image1d_buffer_t, 1)
+
+#undef int1
+#undef float1
+#undef DECL_IMAGE_TYPE_READ_NO_SAMPLE
+#undef DECL_IMAGE_TYPE_WRITE
+#undef DECL_IMAGE
OVERLOADABLE int get_image_channel_data_type(read_only image1d_t image);
OVERLOADABLE int get_image_channel_order(read_only image1d_t image);
@@ -51,37 +100,6 @@ OVERLOADABLE int get_image_channel_data_type(read_only image1d_buffer_t image);
OVERLOADABLE int get_image_channel_order(read_only image1d_buffer_t image);
OVERLOADABLE int get_image_width(read_only image1d_buffer_t image);
-OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imagei(write_only image2d_t cl_image, int2 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image2d_t cl_image, float2 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imageui(write_only image2d_t cl_image, int2 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image2d_t cl_image, float2 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imagef(write_only image2d_t cl_image, int2 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image2d_t cl_image, float2 coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imagei(write_only image1d_array_t cl_image, int2 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image1d_array_t cl_image, float2 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imageui(write_only image1d_array_t cl_image, int2 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_array_t cl_image, float2 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imagef(write_only image1d_array_t cl_image, int2 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image1d_array_t cl_image, float2 coord, float4 color);
-
OVERLOADABLE int get_image_channel_data_type(read_only image2d_t image);
OVERLOADABLE int get_image_channel_order(read_only image2d_t image);
OVERLOADABLE int get_image_width(read_only image2d_t image);
@@ -93,69 +111,6 @@ OVERLOADABLE int get_image_channel_order(read_only image1d_array_t image);
OVERLOADABLE int get_image_width(read_only image1d_array_t image);
OVERLOADABLE size_t get_image_array_size(read_only image1d_array_t image);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, int4 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, float4 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, int4 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, float4 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, int4 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, float4 coord, float4 color);
-
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, int3 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, float3 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, int3 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, float3 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, int3 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, float3 coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, int4 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, float4 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, int4 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, float4 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, int4 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, float4 coord, float4 color);
-
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, int3 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, float3 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, int3 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, float3 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, int3 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, float3 coord, float4 color);
-
OVERLOADABLE int get_image_channel_data_type(read_only image3d_t image);
OVERLOADABLE int get_image_channel_order(read_only image3d_t image);
OVERLOADABLE int get_image_width(read_only image3d_t image);
@@ -205,4 +160,39 @@ OVERLOADABLE int2 get_image_dim(write_only image2d_array_t image);
OVERLOADABLE size_t get_image_array_size(write_only image2d_array_t image);
#endif
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_channel_data_type(read_write image1d_t image);
+OVERLOADABLE int get_image_channel_order(read_write image1d_t image);
+OVERLOADABLE int get_image_width(read_write image1d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_write image1d_buffer_t image);
+OVERLOADABLE int get_image_channel_order(read_write image1d_buffer_t image);
+OVERLOADABLE int get_image_width(read_write image1d_buffer_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_write image2d_t image);
+OVERLOADABLE int get_image_channel_order(read_write image2d_t image);
+OVERLOADABLE int get_image_width(read_write image2d_t image);
+OVERLOADABLE int get_image_height(read_write image2d_t image);
+OVERLOADABLE int2 get_image_dim(read_write image2d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_write image1d_array_t image);
+OVERLOADABLE int get_image_channel_order(read_write image1d_array_t image);
+OVERLOADABLE int get_image_width(read_write image1d_array_t image);
+OVERLOADABLE size_t get_image_array_size(read_write image1d_array_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_write image3d_t image);
+OVERLOADABLE int get_image_channel_order(read_write image3d_t image);
+OVERLOADABLE int get_image_width(read_write image3d_t image);
+OVERLOADABLE int get_image_height(read_write image3d_t image);
+OVERLOADABLE int get_image_depth(read_write image3d_t image);
+OVERLOADABLE int4 get_image_dim(read_write image3d_t image);
+
+OVERLOADABLE int get_image_channel_data_type(read_write image2d_array_t image);
+OVERLOADABLE int get_image_channel_order(read_write image2d_array_t image);
+OVERLOADABLE int get_image_width(read_write image2d_array_t image);
+OVERLOADABLE int get_image_height(read_write image2d_array_t image);
+OVERLOADABLE int2 get_image_dim(read_write image2d_array_t image);
+OVERLOADABLE size_t get_image_array_size(read_write image2d_array_t image);
+#endif
+
#endif
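
To see what the macro rewrite above buys, one instantiation such as DECL_IMAGE(image2d_t, 2) on the 1.2 path expands to the same declarations the old hand-written list spelled out, e.g. (subset):

    OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, int2 coord);
    OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
    OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
    OVERLOADABLE void write_imagei(write_only image2d_t cl_image, int2 coord, int4 color);

plus the ui/f variants for uint4 and float4, and on the 2.0 path the additional read_write overloads.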
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 7d4abab..2c0d700 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -152,4 +152,13 @@ uint8 __gen_ocl_vme(image2d_t, image2d_t,
uint, uint, uint, uint,
uint, uint, uint, uint,
int, int, int);
+
+bool __gen_ocl_in_local(size_t p);
+bool __gen_ocl_in_private(size_t p);
+
+#if (__OPENCL_C_VERSION__ >= 200)
+local void *__to_local(generic void *p);
+global void *__to_global(generic void *p);
+private void *__to_private(generic void *p);
+#endif
#endif
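
The __to_* helpers presumably back the standard CL2.0 to_global/to_local/to_private built-ins, which return a null pointer when the generic pointer does not actually point into the requested space. A hedged sketch:

    /* illustrative: recover a named address space from a generic pointer */
    void scale_if_local(generic float *p, float s) {
      local float *lp = (local float *)__to_local(p);
      if (lp)          /* non-null only if p points into local memory */
        *lp *= s;
    }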
diff --git a/backend/src/libocl/include/ocl_pipe.h b/backend/src/libocl/include/ocl_pipe.h
new file mode 100644
index 0000000..349b1dd
--- /dev/null
+++ b/backend/src/libocl/include/ocl_pipe.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_PIPE_H__
+#define __OCL_PIPE_H__
+
+#include "ocl_types.h"
+#include "ocl_work_group.h"
+#include "ocl_simd.h"
+
+/* The pipe read function. */
+int __read_pipe_2(pipe int p, __generic void* dst);
+int __read_pipe_4(pipe int p, reserve_id_t id, uint index, void* dst);
+reserve_id_t __reserve_read_pipe(pipe int p, uint num);
+void __commit_read_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __work_group_reserve_read_pipe(pipe int p, uint num);
+void __work_group_commit_read_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __sub_group_reserve_read_pipe(pipe int p, uint num);
+void __sub_group_commit_read_pipe(pipe int p, reserve_id_t rid);
+
+/* The pipe write function. */
+int __write_pipe_2(pipe int p, __generic void* src);
+int __write_pipe_4(pipe int p, reserve_id_t id, uint index, void* src);
+reserve_id_t __reserve_write_pipe(pipe int p, uint num);
+void __commit_write_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __work_group_reserve_write_pipe(pipe int p, uint num);
+void __work_group_commit_write_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __sub_group_reserve_write_pipe(pipe int p, uint num);
+void __sub_group_commit_write_pipe(pipe int p, reserve_id_t rid);
+
+/* The reserve_id_t function. */
+bool is_valid_reserve_id(reserve_id_t rid);
+
+/* The pipe query function. */
+uint __get_pipe_num_packets(pipe int p);
+uint __get_pipe_max_packets(pipe int p);
+#endif
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index 312928e..22ff89a 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -23,14 +23,11 @@
/////////////////////////////////////////////////////////////////////////////
// Synchronization functions
/////////////////////////////////////////////////////////////////////////////
-#define CLK_LOCAL_MEM_FENCE (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-
-typedef uint cl_mem_fence_flags;
OVERLOADABLE void barrier(cl_mem_fence_flags flags);
OVERLOADABLE void debugwait(void);
OVERLOADABLE void mem_fence(cl_mem_fence_flags flags);
OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags);
OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags);
-
+#define work_group_barrier barrier
+cl_mem_fence_flags get_fence(void *ptr);
#endif /* __OCL_SYNC_H__ */
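
work_group_barrier is simply the CL2.0 spelling of barrier here; a typical use:

    kernel void double_and_collect(local float *tmp, global float *out) {
      size_t lid = get_local_id(0);
      tmp[lid] *= 2.0f;
      work_group_barrier(CLK_LOCAL_MEM_FENCE);
      if (lid == 0)
        out[get_group_id(0)] = tmp[0];
    }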
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index eb4c3b4..327624b 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -20,6 +20,11 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define DISABLE_ATOMIC_INT64
+#ifndef DISABLE_ATOMIC_INT64
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif
#include "ocl_defines.h"
#define NULL 0
@@ -32,8 +37,6 @@
#define PURE __attribute__((pure))
#define CONST __attribute__((const))
#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
-// FIXME, clang's opencl FE doesn't support static.
-#define static
/////////////////////////////////////////////////////////////////////////////
// OpenCL built-in scalar data types
@@ -44,8 +47,12 @@ typedef unsigned int uint;
typedef unsigned long ulong;
typedef __typeof__(sizeof(int)) size_t;
typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
-typedef signed int intptr_t;
-typedef unsigned int uintptr_t;
+#define __int_t_type(a,b,c) a##b##c
+#define __int_type(type,n) __int_t_type(type,n,_TYPE__)
+typedef __int_type(__INT,__INTPTR_WIDTH__) intptr_t;
+typedef __int_type(__UINT,__INTPTR_WIDTH__) uintptr_t;
+#undef __int_type
+#undef __int_t_type
/////////////////////////////////////////////////////////////////////////////
// OpenCL address space
@@ -84,6 +91,34 @@ DEF(half);
#undef DEF
/////////////////////////////////////////////////////////////////////////////
+// OpenCL atomic related types
+/////////////////////////////////////////////////////////////////////////////
+//atomic flags
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+#define CLK_IMAGE_MEM_FENCE (1 << 2)
+
+typedef uint cl_mem_fence_flags;
+
+//memory order
+typedef enum {
+ memory_order_relaxed,
+ memory_order_acquire,
+ memory_order_release,
+ memory_order_acq_rel,
+ memory_order_seq_cst
+} memory_order;
+
+//memory scope
+typedef enum {
+ memory_scope_work_item,
+ memory_scope_work_group,
+ memory_scope_device,
+ memory_scope_all_svm_devices,
+ memory_scope_sub_group,
+} memory_scope;
+
+/////////////////////////////////////////////////////////////////////////////
// OpenCL built-in event types
/////////////////////////////////////////////////////////////////////////////
// FIXME:
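
The token-pasting above derives intptr_t/uintptr_t from the compiler's actual pointer width instead of hard-coding 32 bits. For the spir64 2.0 build, __INTPTR_WIDTH__ is 64, so the expansion proceeds as:

    __int_type(__INT,64)  ->  __int_t_type(__INT,64,_TYPE__)  ->  __INT64_TYPE__
    typedef __INT64_TYPE__ intptr_t;   /* long on this target */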
diff --git a/backend/src/libocl/include/ocl_vload_20.h b/backend/src/libocl/include/ocl_vload_20.h
new file mode 100644
index 0000000..3f7fc62
--- /dev/null
+++ b/backend/src/libocl/include/ocl_vload_20.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_VLOAD_20_H__
+#define __OCL_VLOAD_20_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores use untyped reads and writes, so we can simply
+// cast to vector loads / stores. Strictly speaking this violates C99
+// aliasing rules, but that is harmless here: TBAA is not enabled in the compiler.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p);
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+ DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __generic)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p);
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p);
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+ DECL_BYTE_RD_SPACE(TYPE, __generic) \
+ DECL_BYTE_WR_SPACE(TYPE, __generic) \
+ DECL_BYTE_RD_SPACE(TYPE, __constant)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_BYTE_RW_ALL(half)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p); \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p); \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p); \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p); \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p); \
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p);
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, , dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, dummy) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, dummy) \
+
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__generic)
+
+DECL_HALF_ST_SPACE(__generic)
+
+//#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
+
+#endif /* __OCL_VLOAD_20_H__ */
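
The practical difference from ocl_vload.h is that the read/write overloads are declared on __generic pointers, so one declaration covers global, local and private callers; e.g.:

    /* illustrative: works for pointers from any address space */
    float4 sum4(const generic float *a, const generic float *b, size_t i) {
      return vload4(i, a) + vload4(i, b);
    }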
diff --git a/backend/src/libocl/include/ocl_workitem.h b/backend/src/libocl/include/ocl_workitem.h
index c3b0bdb..1a96aa8 100644
--- a/backend/src/libocl/include/ocl_workitem.h
+++ b/backend/src/libocl/include/ocl_workitem.h
@@ -21,15 +21,15 @@
#include "ocl_types.h"
OVERLOADABLE uint get_work_dim(void);
-OVERLOADABLE uint get_global_size(uint dimindx);
-OVERLOADABLE uint get_global_id(uint dimindx);
-OVERLOADABLE uint get_local_size(uint dimindx);
-OVERLOADABLE uint get_enqueued_local_size(uint dimindx);
-OVERLOADABLE uint get_local_id(uint dimindx);
-OVERLOADABLE uint get_num_groups(uint dimindx);
-OVERLOADABLE uint get_group_id(uint dimindx);
-OVERLOADABLE uint get_global_offset(uint dimindx);
-OVERLOADABLE uint get_global_linear_id(void);
-OVERLOADABLE uint get_local_linear_id(void);
+OVERLOADABLE size_t get_global_size(uint dimindx);
+OVERLOADABLE size_t get_global_id(uint dimindx);
+OVERLOADABLE size_t get_local_size(uint dimindx);
+OVERLOADABLE size_t get_enqueued_local_size(uint dimindx);
+OVERLOADABLE size_t get_local_id(uint dimindx);
+OVERLOADABLE size_t get_num_groups(uint dimindx);
+OVERLOADABLE size_t get_group_id(uint dimindx);
+OVERLOADABLE size_t get_global_offset(uint dimindx);
+OVERLOADABLE size_t get_global_linear_id(void);
+OVERLOADABLE size_t get_local_linear_id(void);
#endif /* __OCL_WORKITEM_H__ */
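
Returning size_t instead of uint matches the spec signatures and matters for the spir64 build, where global sizes can exceed 32 bits; callers should keep the result in size_t:

    kernel void copy(global const float *src, global float *dst) {
      size_t gid = get_global_id(0);   /* no longer truncated to 32 bits */
      dst[gid] = src[gid];
    }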
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
index 10e8634..1bc8e59 100755
--- a/backend/src/libocl/script/gen_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -114,7 +114,7 @@ def _prefix(prefix, dtype):
return prefix + '_' + dtype
return dtype
-memspaces = ["__local ", "__private ", "__global "]
+memspaces = ["__local ", "__private ", "__global ", "__generic "]
def stripMemSpace(t):
if t[0:2] == '__':
@@ -254,7 +254,8 @@ class builtinProto():
tmpType = line[i]
if tmpType == '__local' or \
tmpType == '__private' or \
- tmpType == '__global':
+           tmpType == '__global' or \
+ tmpType == '__generic':
memSpace = tmpType + ' '
stripped += 1
continue
diff --git a/backend/src/libocl/script/ocl_integer.def b/backend/src/libocl/script/ocl_integer.def
index c35c242..5e41c34 100644
--- a/backend/src/libocl/script/ocl_integer.def
+++ b/backend/src/libocl/script/ocl_integer.def
@@ -7,6 +7,7 @@ gentype rhadd (gentype x, gentype y)
gentype clamp (gentype x, gentype minval, gentype maxval)
gentype clamp (gentype x, sgentype minval, sgentype maxval)
gentype clz (gentype x)
+gentype ctz (gentype x)
gentype mad_hi (gentype a, gentype b, gentype c)
gentype mad_sat (gentype a, gentype b, gentype c)
gentype max (gentype x, gentype y)
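
ctz (count trailing zeros) is the CL2.0 companion to clz; inside a kernel, for a 32-bit value:

    uint x = 0x10u;      /* bit 4 set */
    uint lz = clz(x);    /* 27 */
    uint tz = ctz(x);    /* 4  */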
diff --git a/backend/src/libocl/script/ocl_math_20.def b/backend/src/libocl/script/ocl_math_20.def
new file mode 100644
index 0000000..7392108
--- /dev/null
+++ b/backend/src/libocl/script/ocl_math_20.def
@@ -0,0 +1,151 @@
+##math
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use the hardware mad for fma
+gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentypeh fmax (gentypeh x, half y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentypeh fmin (gentypeh x, half y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype fract (gentype x, __generic gentype *iptr)
+floatn frexp (floatn x, __generic intn *exp)
+float frexp (float x, __generic int *exp)
+halfn frexp (halfn x, __generic intn *exp)
+half frexp (half x, __generic int *exp)
+doublen frexp (doublen x, __generic intn *exp)
+double frexp (double x, __generic int *exp)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+shortn ilogb (halfn x)
+short ilogb (half x)
+intn ilogb (doublen x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+halfn ldexp (halfn x, intn k)
+halfn ldexp (halfn x, int k)
+half ldexp (half x, int k)
+doublen ldexp (doublen x, intn k)
+doublen ldexp (doublen x, int k)
+double ldexp (double x, int k)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __generic intn *signp)
+float lgamma_r (float x, __generic int *signp)
+halfn lgamma_r (halfn x, __generic intn *signp)
+half lgamma_r (half x, __generic int *signp)
+#doublen lgamma_r (doublen x, __generic intn *signp)
+#double lgamma_r (double x, __generic int *signp)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+gentype modf (gentype x, __generic gentype *iptr)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+halfn nan (ushortn nancode)
+half nan (ushort nancode)
+doublen nan (ulongn nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+halfn pown (halfn x, intn y)
+half pown (half x, int y)
+doublen pown (doublen x, intn y)
+double pown (double x, int y)
+gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+floatn remquo (floatn x, floatn y, __generic intn *quo)
+float remquo (float x, float y, __generic int *quo)
+halfn remquo (halfn x, halfn y, __generic intn *quo)
+half remquo (half x, half y, __generic int *quo)
+doublen remquo (doublen x, doublen y, __generic intn *quo)
+double remquo (double x, double y, __generic int *quo)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+halfn rootn (halfn x, intn y)
+doublen rootn (doublen x, intn y)
+double rootn (double x, int y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sincos (gentype x, __generic gentype *cosval)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+gentype tgamma (gentype)
+gentype trunc (gentype)
+
+
+# XXX the native_ functions are already defined as the same
+# implementation as their non-native counterparts.
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
+gentype native_exp2 (gentype x)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
+gentype native_sqrt (gentype x)
+gentype native_tan (gentype x)
+
+
+##half_native_math
+gentype half_cos (gentype x)
+gentype half_divide (gentype x, gentype y)
+gentype half_exp (gentype x)
+gentype half_exp2 (gentype x)
+gentype half_exp10 (gentype x)
+gentype half_log (gentype x)
+gentype half_log2 (gentype x)
+gentype half_log10 (gentype x)
+gentype half_powr (gentype x, gentype y)
+gentype half_recip (gentype x)
+gentype half_rsqrt (gentype x)
+gentype half_sin (gentype x)
+gentype half_sqrt (gentype x)
+gentype half_tan (gentype x)
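+#
+# Usage sketch (illustrative, not part of the generated library): the
+# __generic address space on the pointer parameters above is the main
+# OpenCL 2.0 change here, letting a single overload accept private,
+# local and global pointers alike, e.g.
+#   float c;
+#   y[i] = sincos(x[i], &c);  /* &c converts implicitly to __generic float* */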
diff --git a/backend/src/libocl/src/ocl_atom_20.cl b/backend/src/libocl/src/ocl_atom_20.cl
new file mode 100644
index 0000000..ca200bc
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atom_20.cl
@@ -0,0 +1,381 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_atom_20.h"
+#include "ocl_as.h"
+#include "ocl_sync.h"
+
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX) \
+ OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+ return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val); \
+ }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
+
+#define DECL_ATOMIC_OP(NAME) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_) \
+ DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
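+
+/* For illustration, DECL_ATOMIC_OP_TYPE(add, int, atomic_) above expands to
+ * roughly this pair of overloads (one per address space):
+ *
+ *   OVERLOADABLE int atomic_add (volatile __global int *p, int val) {
+ *     return (int)__gen_ocl_atomic_add((__global uint *)p, val);
+ *   }
+ *   OVERLOADABLE int atomic_add (volatile __local int *p, int val) {
+ *     return (int)__gen_ocl_atomic_add((__local uint *)p, val);
+ *   }
+ */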
+
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX) \
+ OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+ return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+ }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+ OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+ return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+ }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+ OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+ return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+ }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+// XXX for the conformance test:
+// the atom_xxx API below is from the OpenCL 1.0 spec,
+// but the conformance test suite exercises it anyway.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
+// OpenCL 2.0 features.
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_OP(NAME, PREFIX) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ //DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
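+/* Note: because the commented-out 64-bit line above ends in a backslash,
+ * line splicing happens before comment removal, so the // comment swallows
+ * the following atomic_long line as well; DECL_ATOMIC_OP (and the three
+ * macros below with the same shape) currently emit only the 32-bit
+ * overloads. */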
+#define DECL_ATOMIC_COMPARE_EXCHANGE_OP(NAME, PREFIX) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ //DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_LOAD_OP(NAME, PREFIX) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ //DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_NO_RET_OP(NAME, PREFIX) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ //DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+DECL_ATOMIC_OP(exchange, exchange)
+DECL_ATOMIC_OP(fetch_add, fetch_add)
+DECL_ATOMIC_OP(fetch_sub, fetch_sub)
+DECL_ATOMIC_OP(fetch_and, fetch_and)
+DECL_ATOMIC_OP(fetch_or, fetch_or)
+DECL_ATOMIC_OP(fetch_xor, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load, fetch_add)
+DECL_ATOMIC_NO_RET_OP(init, exchange)
+DECL_ATOMIC_NO_RET_OP(store, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax32, atomic_uint, atomic_int, uint)
+#ifndef DISABLE_ATOMIC_INT64
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax64, atomic_ulong, atomic_long, ulong)
+#endif
+DECL_ATOMIC_OP_TYPE(exchange, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(init, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(store, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_LOAD_TYPE(load, fetch_addf, atomic_float, atomic_int, float)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
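+
+/* Usage sketch (illustrative, not part of this file): with the overloads
+ * generated above, kernels can use the C11-style OpenCL 2.0 atomics
+ * directly, e.g.
+ *
+ *   kernel void count(volatile global atomic_int *counter) {
+ *     atomic_fetch_add(counter, 1);  // seq_cst, device scope by default
+ *   }
+ */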
+
+// with memory_order.
+
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, memory_scope_device); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \
+ }
+
+DECL_ATOMIC_OP(exchange_explicit, exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit, fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+#ifndef DISABLE_ATOMIC_INT64
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+#endif
+DECL_ATOMIC_OP_TYPE(exchange_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(init_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(store_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_LOAD_TYPE(load_explicit, fetch_addf, atomic_float, atomic_int, float)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
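+
+/* Usage sketch: the _explicit forms take the memory order as an argument,
+ * e.g. a classic compare-exchange retry loop (illustrative only; note this
+ * backend lowers every order to seq_cst anyway, see ocl_atomic_20.ll):
+ *
+ *   int expected = atomic_load_explicit(v, memory_order_acquire);
+ *   while (!atomic_compare_exchange_weak_explicit(v, &expected, expected + 1,
+ *          memory_order_acq_rel, memory_order_acquire))
+ *     ;  // on failure, expected holds the value actually observed
+ */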
+
+// with memory_order and memory_scope
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure, memory_scope scope) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, scope); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order, memory_scope scope) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, scope); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \
+ }
+
+DECL_ATOMIC_OP(exchange_explicit, exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit, fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+#ifndef DISABLE_ATOMIC_INT64
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+#endif
+DECL_ATOMIC_OP_TYPE(exchange_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(init_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(store_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_LOAD_TYPE(load_explicit, fetch_addf, atomic_float, atomic_int, float)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_LOAD_OP
+#undef DECL_ATOMIC_NO_RET_OP
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_OP
+
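+/* atomic_flag helpers: __gen_ocl_atomic_compare_exchange_strong32 returns
+ * the value observed before the CAS, so oldValue == 1 means the flag was
+ * already set and test_and_set correctly reports the previous state. Note
+ * that the order and scope arguments of the _explicit variants are
+ * currently ignored: everything below is seq_cst at device scope. */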
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object) {
+ atomic_int * temp = (atomic_int*)object;
+ int expected = 0;
+ int new_value = 1;
+ int oldValue = __gen_ocl_atomic_compare_exchange_strong32(temp, expected, new_value, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+ if(oldValue == new_value)
+ return true;
+ else
+ return false;
+}
+
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order) {
+ atomic_int * temp = (atomic_int*)object;
+ int expected = 0;
+ int new_value = 1;
+ int oldValue = __gen_ocl_atomic_compare_exchange_strong32(temp, expected, new_value, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+ if(oldValue == new_value)
+ return true;
+ else
+ return false;
+}
+
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+ atomic_int * temp = (atomic_int*)object;
+ int expected = 0;
+ int new_value = 1;
+ int oldValue = __gen_ocl_atomic_compare_exchange_strong32(temp, expected, new_value, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+ if(oldValue == new_value)
+ return true;
+ else
+ return false;
+}
+
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope){
+}
diff --git a/backend/src/libocl/src/ocl_atomic_20.ll b/backend/src/libocl/src/ocl_atomic_20.ll
new file mode 100644
index 0000000..38efac0
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atomic_20.ll
@@ -0,0 +1,165 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
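+; Note: the trailing i32 %order/%scope parameters on every wrapper below are
+; currently ignored; each atomicrmw/cmpxchg is emitted seq_cst, matching the
+; seq_cst/device-scope defaults used on the OpenCL C side.
+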
+;32bit version.
+define i32 @__gen_ocl_atomic_exchange32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_exchangef(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_add32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_addf(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_sub32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_or32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_xor32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_and32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_imin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_imax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_umin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_umax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_compare_exchange_strong32(i32 addrspace(4)* nocapture %ptr, i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+}
+
+define i32 @__gen_ocl_atomic_compare_exchange_weak32(i32 addrspace(4)* nocapture %ptr, i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg weak volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+}
+
+;64bit version
+
+define i64 @__gen_ocl_atomic_exchange64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_add64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_sub64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_or64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_xor64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_and64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_imin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_imax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_umin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_umax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_compare_exchange_strong64(i64 addrspace(4)* nocapture %ptr, i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+ %1 = extractvalue { i64, i1 } %0, 0
+ ret i64 %1
+}
+
+define i64 @__gen_ocl_atomic_compare_exchange_weak64(i64 addrspace(4)* nocapture %ptr, i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg weak volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+ %1 = extractvalue { i64, i1 } %0, 0
+ ret i64 %1
+}
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 9416f80..502ee67 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -11,34 +11,11 @@ declare i32 @_get_local_mem_fence() nounwind alwaysinline
declare i32 @_get_global_mem_fence() nounwind alwaysinline
declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
-declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
- %1 = icmp eq i32 %flags, 3
- br i1 %1, label %barrier_local_global, label %barrier_local_check
-
-barrier_local_global:
- call void @__gen_ocl_barrier_local_and_global()
- br label %done
-
-barrier_local_check:
- %2 = icmp eq i32 %flags, 1
- br i1 %2, label %barrier_local, label %barrier_global_check
-
-barrier_local:
- call void @__gen_ocl_barrier_local()
- br label %done
-
-barrier_global_check:
- %3 = icmp eq i32 %flags, 2
- br i1 %3, label %barrier_global, label %done
-
-barrier_global:
- call void @__gen_ocl_barrier_global()
- br label %done
-
-done:
+ call void @__gen_ocl_barrier(i32 %flags)
ret void
}
diff --git a/backend/src/libocl/src/ocl_barrier_20.ll b/backend/src/libocl/src/ocl_barrier_20.ll
new file mode 100644
index 0000000..8935076
--- /dev/null
+++ b/backend/src/libocl/src/ocl_barrier_20.ll
@@ -0,0 +1,25 @@
+;XXX FIXME: since LLVM IR has no preprocessor macros, the flag values
+;3, 1, 2 are hardcoded here; we may need a more graceful way to handle
+;this kind of value later.
+;#define CLK_LOCAL_MEM_FENCE (1 << 0)
+;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+declare i32 @_get_local_mem_fence() nounwind alwaysinline
+declare i32 @_get_global_mem_fence() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
+
+define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
+ call void @__gen_ocl_barrier(i32 %flags)
+ ret void
+}
+
+define void @_Z9debugwaitv() nounwind noduplicate alwaysinline {
+ call void @__gen_ocl_debugwait()
+ ret void
+}
diff --git a/backend/src/libocl/src/ocl_clz_20.ll b/backend/src/libocl/src/ocl_clz_20.ll
new file mode 100644
index 0000000..19f4e35
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz_20.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
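+; The 64-bit variants below synthesize a 64-bit ctlz from two 32-bit halves:
+; if the high word (lane 1 of the <2 x i32>) is non-zero, its leading-zero
+; count is the answer; otherwise it is the low word's count plus 32. Only
+; lane 0 of the result vector is written, so the high half of the returned
+; i64 is left undef.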
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
+ %1 = bitcast i64 %x to <2 x i32>
+ %2 = extractelement <2 x i32> %1, i32 0
+ %3 = extractelement <2 x i32> %1, i32 1
+ %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+ %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+ %cmp = icmp ult i32 %call2, 32
+ %4 = add i32 %call1, 32
+ %5 = select i1 %cmp, i32 %call2, i32 %4
+ %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+ %call = bitcast <2 x i32> %6 to i64
+ ret i64 %call
+}
+
+define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline {
+ %1 = bitcast i64 %x to <2 x i32>
+ %2 = extractelement <2 x i32> %1, i32 0
+ %3 = extractelement <2 x i32> %1, i32 1
+ %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+ %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+ %cmp = icmp ult i32 %call2, 32
+ %4 = add i32 %call1, 32
+ %5 = select i1 %cmp, i32 %call2, i32 %4
+ %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+ %call = bitcast <2 x i32> %6 to i64
+ ret i64 %call
+}
diff --git a/backend/src/libocl/src/ocl_ctz.ll b/backend/src/libocl/src/ocl_ctz.ll
new file mode 100644
index 0000000..f30bd0a
--- /dev/null
+++ b/backend/src/libocl/src/ocl_ctz.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i8 @ctz_s8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i8 @ctz_u8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i16 @ctz_s16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i16 @ctz_u16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i32 @ctz_s32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i32 @ctz_u32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
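+; Mirror of the 64-bit clz trick, for trailing zeros: if the low word
+; (lane 0) is non-zero, its trailing-zero count is the answer; otherwise
+; it is the high word's count plus 32. As with clz, only lane 0 of the
+; result is written, leaving the high half of the returned i64 undef.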
+define i64 @ctz_s64(i64 %x) nounwind readnone alwaysinline {
+ %1 = bitcast i64 %x to <2 x i32>
+ %2 = extractelement <2 x i32> %1, i32 0
+ %3 = extractelement <2 x i32> %1, i32 1
+ %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+ %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+ %cmp = icmp ult i32 %call1, 32
+ %4 = add i32 %call2, 32
+ %5 = select i1 %cmp, i32 %call1, i32 %4
+ %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+ %call = bitcast <2 x i32> %6 to i64
+ ret i64 %call
+}
+
+define i64 @ctz_u64(i64 %x) nounwind readnone alwaysinline {
+ %1 = bitcast i64 %x to <2 x i32>
+ %2 = extractelement <2 x i32> %1, i32 0
+ %3 = extractelement <2 x i32> %1, i32 1
+ %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+ %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+ %cmp = icmp ult i32 %call1, 32
+ %4 = add i32 %call2, 32
+ %5 = select i1 %cmp, i32 %call1, i32 %4
+ %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+ %call = bitcast <2 x i32> %6 to i64
+ ret i64 %call
+}
diff --git a/backend/src/libocl/src/ocl_ctz_20.ll b/backend/src/libocl/src/ocl_ctz_20.ll
new file mode 100644
index 0000000..0a79b26
--- /dev/null
+++ b/backend/src/libocl/src/ocl_ctz_20.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i8 @ctz_s8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i8 @ctz_u8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i16 @ctz_s16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i16 @ctz_u16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i32 @ctz_s32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i32 @ctz_u32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i64 @ctz_s64(i64 %x) nounwind readnone alwaysinline {
+ %1 = bitcast i64 %x to <2 x i32>
+ %2 = extractelement <2 x i32> %1, i32 0
+ %3 = extractelement <2 x i32> %1, i32 1
+ %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+ %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+ %cmp = icmp ult i32 %call1, 32
+ %4 = add i32 %call2, 32
+ %5 = select i1 %cmp, i32 %call1, i32 %4
+ %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+ %call = bitcast <2 x i32> %6 to i64
+ ret i64 %call
+}
+
+define i64 @ctz_u64(i64 %x) nounwind readnone alwaysinline {
+ %1 = bitcast i64 %x to <2 x i32>
+ %2 = extractelement <2 x i32> %1, i32 0
+ %3 = extractelement <2 x i32> %1, i32 1
+ %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+ %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+ %cmp = icmp ult i32 %call1, 32
+ %4 = add i32 %call2, 32
+ %5 = select i1 %cmp, i32 %call1, i32 %4
+ %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+ %call = bitcast <2 x i32> %6 to i64
+ ret i64 %call
+}
diff --git a/backend/src/libocl/src/ocl_enqueue.cl b/backend/src/libocl/src/ocl_enqueue.cl
new file mode 100644
index 0000000..dc8fa3b
--- /dev/null
+++ b/backend/src/libocl/src/ocl_enqueue.cl
@@ -0,0 +1,238 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_types.h"
+#include "ocl_enqueue.h"
+#include "ocl_workitem.h"
+#include "ocl_atom.h"
+
+queue_t get_default_queue(void)
+{
+ queue_t queue;
+ return queue; //return NULL queue
+}
+
+ndrange_t __gen_ocl_set_ndrange_info(__private struct ndrange_info_t *info);
+__private struct ndrange_info_t* __gen_ocl_get_ndrange_info(ndrange_t info);
+__global int* __gen_ocl_get_enqueue_info_addr(void);
+
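+/* Sketch of the mechanism (inferred from the code below): the int at
+ * __gen_ocl_get_enqueue_info_addr() acts as an allocation cursor that is
+ * bumped atomically; each enqueue appends a record of the form
+ * [struct ndrange_info_t][captured block data], which the runtime can
+ * later walk to dispatch the child kernel. */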
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void))
+{
+ int i;
+ __private struct Block_literal *literal = (__private struct Block_literal *)block;
+ __private uchar *data = (__private uchar *)block;
+ int size = literal->descriptor->size;
+ literal->descriptor->reserved = 0;
+ __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+ int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t));
+ __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+ __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+
+ *((__global struct ndrange_info_t *)addr) = *info;
+ addr += sizeof(*info);
+
+ for(i=0; i< size; i++) {
+ addr[i] = data[i];
+ }
+ return 0;
+}
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+ uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+ clk_event_t *event_ret, void (^block)(void))
+{
+ return enqueue_kernel(q, flag, ndrange, block);
+}
+
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes)
+{
+ int i;
+ __private struct Block_literal* literal = (__private struct Block_literal *)block;
+ __private uchar* data = (__private uchar *)block;
+ int size = literal->descriptor->size;
+ int slm_size = count * sizeof(int);
+ literal->descriptor->reserved = slm_size;
+ __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+ int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t) + slm_size);
+ __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+ __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+
+ *((__global struct ndrange_info_t *)addr) = *info;
+ addr += sizeof(*info);
+
+ for(i=0; i < size; i++) {
+ addr[i] = data[i];
+ }
+
+ addr += size;
+ for(i=0; i < count; i++) {
+ ((__global int *)addr)[i] = slm_sizes[i];
+ }
+ return 0;
+}
+
+clk_event_t create_user_event(void)
+{
+ clk_event_t e;
+ return e;
+}
+
+void retain_event(clk_event_t event)
+{
+ return;
+}
+
+void release_event(clk_event_t event)
+{
+ return;
+}
+
+void set_user_event_status(clk_event_t event, int status)
+{
+ return;
+}
+
+bool is_valid_event(clk_event_t event)
+{
+ return 1;
+}
+
+uint __get_kernel_work_group_size_impl(__private void *block)
+{
+ return 256;
+}
+
+uint __get_kernel_preferred_work_group_multiple_impl(__private void *block)
+{
+ return 16;
+}
+
+void capture_event_profiling_info(clk_event_t event, int name, global void *value)
+{
+ //fake profiling data
+ ((__global ulong *)value)[0] = 0x3000;
+ ((__global ulong *)value)[1] = 0x6000;
+}
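+
+/* The ndrange_info_t.type encoding used below: the high nibble is the
+ * dimensionality minus one (0x0?/0x1?/0x2? for 1D/2D/3D) and the low
+ * nibble records which parameters were supplied (1 = global size only,
+ * 2 = + local size, 3 = + global offset). */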
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size)
+{
+ struct ndrange_info_t info;
+ info.type = 0x1;
+ info.global_work_size[0] = global_work_size;
+ return __gen_ocl_set_ndrange_info(&info);
+ //return ndrange;
+}
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size)
+{
+ struct ndrange_info_t info;
+ info.type = 0x2;
+ info.global_work_size[0] = global_work_size;
+ info.local_work_size[0] = local_work_size;
+ return __gen_ocl_set_ndrange_info(&info);
+ // return ndrange;
+}
+
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size)
+{
+ struct ndrange_info_t info;
+ info.type = 0x3;
+ info.global_work_size[0] = global_work_size;
+ info.local_work_size[0] = local_work_size;
+ info.global_work_offset[0] = global_work_offset;
+ return __gen_ocl_set_ndrange_info(&info);
+ //return ndrange;
+}
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
+{
+ struct ndrange_info_t info;
+ info.type = 0x11;
+ info.global_work_size[0] = global_work_size[0];
+ info.global_work_size[1] = global_work_size[1];
+ return __gen_ocl_set_ndrange_info(&info);
+ //return ndrange;
+}
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2])
+{
+ struct ndrange_info_t info;
+ info.type = 0x12;
+ info.global_work_size[0] = global_work_size[0];
+ info.global_work_size[1] = global_work_size[1];
+ info.local_work_size[0] = local_work_size[0];
+ info.local_work_size[1] = local_work_size[1];
+ return __gen_ocl_set_ndrange_info(&info);
+}
+
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2])
+{
+ struct ndrange_info_t info;
+ info.type = 0x13;
+ info.global_work_size[0] = global_work_size[0];
+ info.global_work_size[1] = global_work_size[1];
+ info.local_work_size[0] = local_work_size[0];
+ info.local_work_size[1] = local_work_size[1];
+ info.global_work_offset[0] = global_work_offset[0];
+ info.global_work_offset[1] = global_work_offset[1];
+ return __gen_ocl_set_ndrange_info(&info);
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
+{
+ struct ndrange_info_t info;
+ info.type = 0x21;
+ info.global_work_size[0] = global_work_size[0];
+ info.global_work_size[1] = global_work_size[1];
+ info.global_work_size[2] = global_work_size[2];
+ return __gen_ocl_set_ndrange_info(&info);
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3])
+{
+ struct ndrange_info_t info;
+ info.type = 0x22;
+ info.global_work_size[0] = global_work_size[0];
+ info.global_work_size[1] = global_work_size[1];
+ info.global_work_size[2] = global_work_size[2];
+ info.local_work_size[0] = local_work_size[0];
+ info.local_work_size[1] = local_work_size[1];
+ info.local_work_size[2] = local_work_size[2];
+ return __gen_ocl_set_ndrange_info(&info);
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3])
+{
+ struct ndrange_info_t info;
+ info.type = 0x23;
+ info.global_work_size[0] = global_work_size[0];
+ info.global_work_size[1] = global_work_size[1];
+ info.global_work_size[2] = global_work_size[2];
+ info.local_work_size[0] = local_work_size[0];
+ info.local_work_size[1] = local_work_size[1];
+ info.local_work_size[2] = local_work_size[2];
+ info.global_work_offset[0] = global_work_offset[0];
+ info.global_work_offset[1] = global_work_offset[1];
+ info.global_work_offset[2] = global_work_offset[2];
+ return __gen_ocl_set_ndrange_info(&info);
+}
+
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret)
+{
+ return 0;
+}
diff --git a/backend/src/libocl/src/ocl_geometric.cl b/backend/src/libocl/src/ocl_geometric.cl
index cf98503..af39ed3 100644
--- a/backend/src/libocl/src/ocl_geometric.cl
+++ b/backend/src/libocl/src/ocl_geometric.cl
@@ -18,7 +18,11 @@
#include "ocl_geometric.h"
#include "ocl_common.h"
#include "ocl_relational.h"
+#if (__OPENCL_C_VERSION__ >= 200)
+#include "ocl_math_20.h"
+#else
#include "ocl_math.h"
+#endif
#include "ocl_float.h"
CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
diff --git a/backend/src/libocl/src/ocl_image.cl b/backend/src/libocl/src/ocl_image.cl
index a1125a8..2febfda 100644
--- a/backend/src/libocl/src/ocl_image.cl
+++ b/backend/src/libocl/src/ocl_image.cl
@@ -16,7 +16,11 @@
*
*/
#include "ocl_image.h"
+#if (__OPENCL_C_VERSION__ >= 200)
+#include "ocl_math_20.h"
+#else
#include "ocl_math.h"
+#endif
#include "ocl_integer.h"
#include "ocl_common.h"
#include "ocl_convert.h"
@@ -77,6 +81,42 @@ DECL_GEN_OCL_QUERY_IMAGE(write_only image2d_t)
DECL_GEN_OCL_QUERY_IMAGE(write_only image2d_array_t)
DECL_GEN_OCL_QUERY_IMAGE(write_only image3d_t)
#endif
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_GEN_OCL_RW_IMAGE_WR(image_type, n) \
+ OVERLOADABLE int4 __gen_ocl_read_imagei(read_write image_type image, sampler_t sampler, \
+ float ##n coord, uint sampler_offset); \
+ OVERLOADABLE int4 __gen_ocl_read_imagei(read_write image_type image, sampler_t sampler, \
+ int ##n coord, uint sampler_offset); \
+ OVERLOADABLE uint4 __gen_ocl_read_imageui(read_write image_type image, sampler_t sampler, \
+ float ##n coord, uint sampler_offset); \
+ OVERLOADABLE uint4 __gen_ocl_read_imageui(read_write image_type image, sampler_t sampler, \
+ int ##n coord, uint sampler_offset); \
+ OVERLOADABLE float4 __gen_ocl_read_imagef(read_write image_type image, sampler_t sampler, \
+ float ##n coord, uint sampler_offset); \
+ OVERLOADABLE float4 __gen_ocl_read_imagef(read_write image_type image, sampler_t sampler, \
+ int ##n coord, uint sampler_offset); \
+ OVERLOADABLE void __gen_ocl_write_imagei(read_write image_type image, int ##n coord , int4 color); \
+ OVERLOADABLE void __gen_ocl_write_imageui(read_write image_type image, int ##n coord, uint4 color);\
+ OVERLOADABLE void __gen_ocl_write_imagef(read_write image_type image, int ##n coord, float4 color);
+
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_t, 1)
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_buffer_t, 2)
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_array_t, 2)
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_array_t, 4)
+DECL_GEN_OCL_RW_IMAGE_WR(image2d_t, 2)
+DECL_GEN_OCL_RW_IMAGE_WR(image2d_array_t, 3)
+DECL_GEN_OCL_RW_IMAGE_WR(image3d_t, 3)
+DECL_GEN_OCL_RW_IMAGE_WR(image2d_array_t, 4)
+DECL_GEN_OCL_RW_IMAGE_WR(image3d_t, 4)
+
+DECL_GEN_OCL_QUERY_IMAGE(read_write image1d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image1d_buffer_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image1d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image2d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image2d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image3d_t)
+#endif
///////////////////////////////////////////////////////////////////////////////
// helper functions to validate array index.
///////////////////////////////////////////////////////////////////////////////
@@ -160,6 +200,51 @@ INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, write_only image
return coord;
}
#endif
+
+#if (__OPENCL_C_VERSION__ >= 200)
+INLINE_OVERLOADABLE float2 __gen_validate_array_index(float2 coord, read_write image1d_array_t image)
+{
+ float array_size = __gen_ocl_get_image_depth(image);
+ coord.s1 = clamp(rint(coord.s1), 0.f, array_size - 1.f);
+ return coord;
+}
+
+INLINE_OVERLOADABLE float4 __gen_validate_array_index(float4 coord, read_write image2d_array_t image)
+{
+ float array_size = __gen_ocl_get_image_depth(image);
+ coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+ return coord;
+}
+
+INLINE_OVERLOADABLE float3 __gen_validate_array_index(float3 coord, read_write image2d_array_t image)
+{
+ float array_size = __gen_ocl_get_image_depth(image);
+ coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+ return coord;
+}
+
+INLINE_OVERLOADABLE int2 __gen_validate_array_index(int2 coord, read_write image1d_array_t image)
+{
+ int array_size = __gen_ocl_get_image_depth(image);
+ coord.s1 = clamp(coord.s1, 0, array_size - 1);
+ return coord;
+}
+
+INLINE_OVERLOADABLE int4 __gen_validate_array_index(int4 coord, read_write image2d_array_t image)
+{
+ int array_size = __gen_ocl_get_image_depth(image);
+ coord.s2 = clamp(coord.s2, 0, array_size - 1);
+ return coord;
+}
+
+INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, read_write image2d_array_t image)
+{
+ int array_size = __gen_ocl_get_image_depth(image);
+ coord.s2 = clamp(coord.s2, 0, array_size - 1);
+ return coord;
+}
+#endif
+
// For non array image type, we need to do nothing.
#define GEN_VALIDATE_ARRAY_INDEX(coord_type, image_type) \
INLINE_OVERLOADABLE coord_type __gen_validate_array_index(coord_type coord, image_type image) \
@@ -190,6 +275,19 @@ GEN_VALIDATE_ARRAY_INDEX(int3, write_only image3d_t)
GEN_VALIDATE_ARRAY_INDEX(float, write_only image1d_buffer_t)
GEN_VALIDATE_ARRAY_INDEX(int, write_only image1d_buffer_t)
#endif
+
+#if (__OPENCL_C_VERSION__ >= 200)
+GEN_VALIDATE_ARRAY_INDEX(float, read_write image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(int, read_write image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(float2, read_write image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(int2, read_write image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(float4, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int4, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float3, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int3, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float, read_write image1d_buffer_t)
+GEN_VALIDATE_ARRAY_INDEX(int, read_write image1d_buffer_t)
+#endif
///////////////////////////////////////////////////////////////////////////////
// Helper functions to work around some coordinate boundary issues.
// The major issue on Gen7/Gen7.5 is that the sample message could not sample
@@ -390,9 +488,9 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
convert_float ##n (tmpCoord), 0); \
}
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type, \
+#define DECL_READ_IMAGE_NOSAMPLER(access_qual, image_type, image_data_type, \
suffix, coord_type, n) \
- OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image, \
+ OVERLOADABLE image_data_type read_image ##suffix(access_qual image_type cl_image, \
coord_type coord) \
{ \
coord = __gen_validate_array_index(coord, cl_image); \
@@ -402,8 +500,8 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
cl_image, defaultSampler, convert_float ##n (coord), 0); \
}
-#define DECL_WRITE_IMAGE(image_type, image_data_type, suffix, coord_type) \
- OVERLOADABLE void write_image ##suffix(write_only image_type cl_image, \
+#define DECL_WRITE_IMAGE(access_qual, image_type, image_data_type, suffix, coord_type) \
+ OVERLOADABLE void write_image ##suffix(access_qual image_type cl_image, \
coord_type coord, \
image_data_type color) \
{ \
@@ -411,13 +509,25 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
__gen_ocl_write_image ##suffix(cl_image, fixedCoord, color); \
}
+#if (__OPENCL_C_VERSION__ >= 200)
#define DECL_IMAGE(int_clamping_fix, image_type, image_data_type, suffix, n) \
DECL_READ_IMAGE0(int_clamping_fix, image_type, \
image_data_type, suffix, int ##n, n) \
DECL_READ_IMAGE1(int_clamping_fix, image_type, \
image_data_type, suffix, float ##n, n) \
- DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type, suffix, int ##n, n) \
- DECL_WRITE_IMAGE(image_type, image_data_type, suffix, int ##n) \
+ DECL_READ_IMAGE_NOSAMPLER(read_only, image_type, image_data_type, suffix, int ##n, n) \
+ DECL_READ_IMAGE_NOSAMPLER(read_write, image_type, image_data_type, suffix, int ##n, n) \
+ DECL_WRITE_IMAGE(write_only, image_type, image_data_type, suffix, int ##n) \
+ DECL_WRITE_IMAGE(read_write, image_type, image_data_type, suffix, int ##n)
+#else
+#define DECL_IMAGE(int_clamping_fix, image_type, image_data_type, suffix, n) \
+ DECL_READ_IMAGE0(int_clamping_fix, image_type, \
+ image_data_type, suffix, int ##n, n) \
+ DECL_READ_IMAGE1(int_clamping_fix, image_type, \
+ image_data_type, suffix, float ##n, n) \
+ DECL_READ_IMAGE_NOSAMPLER(read_only, image_type, image_data_type, suffix, int ##n, n) \
+ DECL_WRITE_IMAGE(write_only, image_type, image_data_type, suffix, int ##n)
+#endif
// 1D
#define DECL_IMAGE_TYPE(image_type, n) \
@@ -432,9 +542,9 @@ DECL_IMAGE_TYPE(image3d_t, 3)
DECL_IMAGE_TYPE(image2d_array_t, 4)
DECL_IMAGE_TYPE(image2d_array_t, 3)
-#define DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image_type, image_data_type, \
+#define DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(access_qual, image_type, image_data_type, \
suffix, coord_type) \
- OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image, \
+ OVERLOADABLE image_data_type read_image ##suffix(access_qual image_type cl_image, \
coord_type coord) \
{ \
sampler_t defaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE \
@@ -446,8 +556,8 @@ DECL_IMAGE_TYPE(image2d_array_t, 3)
cl_image, defaultSampler, convert_float2(effectCoord), 0); \
}
-#define DECL_WRITE_IMAGE1D_BUFFER(image_type, image_data_type, suffix, coord_type) \
- OVERLOADABLE void write_image ##suffix(write_only image_type cl_image, \
+#define DECL_WRITE_IMAGE1D_BUFFER(access_qual, image_type, image_data_type, suffix, coord_type) \
+ OVERLOADABLE void write_image ##suffix(access_qual image_type cl_image, \
coord_type coord, \
image_data_type color) \
{ \
@@ -457,11 +567,20 @@ DECL_IMAGE_TYPE(image2d_array_t, 3)
__gen_ocl_write_image ##suffix(cl_image, effectCoord, color); \
}
-
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE_1DBuffer(int_clamping_fix, image_data_type, suffix) \
+ DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(read_only, image1d_buffer_t, image_data_type, \
+ suffix, int) \
+ DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(read_write, image1d_buffer_t, image_data_type, \
+ suffix, int) \
+ DECL_WRITE_IMAGE1D_BUFFER(write_only, image1d_buffer_t, image_data_type, suffix, int) \
+ DECL_WRITE_IMAGE1D_BUFFER(read_write, image1d_buffer_t, image_data_type, suffix, int)
+#else
#define DECL_IMAGE_1DBuffer(int_clamping_fix, image_data_type, suffix) \
- DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image1d_buffer_t, image_data_type, \
+ DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(read_only, image1d_buffer_t, image_data_type, \
suffix, int) \
- DECL_WRITE_IMAGE1D_BUFFER(image1d_buffer_t, image_data_type, suffix, int)
+ DECL_WRITE_IMAGE1D_BUFFER(write_only, image1d_buffer_t, image_data_type, suffix, int)
+#endif
DECL_IMAGE_1DBuffer(GEN_FIX_INT_CLAMPING, int4, i)
DECL_IMAGE_1DBuffer(GEN_FIX_INT_CLAMPING, uint4, ui)
@@ -535,12 +654,23 @@ INLINE_OVERLOADABLE int4 __gen_fixup_1darray_coord(int2 coord, image1d_array_t i
convert_float2 (tmpCoord), 0); \
}
+#if (__OPENCL_C_VERSION__ >= 200)
#define DECL_IMAGE_1DArray(int_clamping_fix, image_data_type, suffix) \
DECL_READ_IMAGE0_1DArray(int_clamping_fix, image_data_type, suffix, int2) \
DECL_READ_IMAGE1_1DArray(int_clamping_fix, image_data_type, \
suffix, float2) \
- DECL_READ_IMAGE_NOSAMPLER(image1d_array_t, image_data_type, suffix, int2, 2)\
- DECL_WRITE_IMAGE(image1d_array_t, image_data_type, suffix, int2) \
+ DECL_READ_IMAGE_NOSAMPLER(read_only, image1d_array_t, image_data_type, suffix, int2, 2) \
+ DECL_READ_IMAGE_NOSAMPLER(read_write, image1d_array_t, image_data_type, suffix, int2, 2)\
+ DECL_WRITE_IMAGE(write_only, image1d_array_t, image_data_type, suffix, int2) \
+ DECL_WRITE_IMAGE(read_write, image1d_array_t, image_data_type, suffix, int2)
+#else
+#define DECL_IMAGE_1DArray(int_clamping_fix, image_data_type, suffix) \
+ DECL_READ_IMAGE0_1DArray(int_clamping_fix, image_data_type, suffix, int2) \
+ DECL_READ_IMAGE1_1DArray(int_clamping_fix, image_data_type, \
+ suffix, float2) \
+ DECL_READ_IMAGE_NOSAMPLER(read_only, image1d_array_t, image_data_type, suffix, int2, 2) \
+ DECL_WRITE_IMAGE(write_only, image1d_array_t, image_data_type, suffix, int2)
+#endif
DECL_IMAGE_1DArray(GEN_FIX_INT_CLAMPING, int4, i)
DECL_IMAGE_1DArray(GEN_FIX_INT_CLAMPING, uint4, ui)
@@ -579,6 +709,15 @@ DECL_IMAGE_INFO_COMMON(write_only image3d_t)
DECL_IMAGE_INFO_COMMON(write_only image2d_array_t)
#endif
+#if (__OPENCL_C_VERSION__ >= 200)
+DECL_IMAGE_INFO_COMMON(read_write image1d_t)
+DECL_IMAGE_INFO_COMMON(read_write image1d_buffer_t)
+DECL_IMAGE_INFO_COMMON(read_write image1d_array_t)
+DECL_IMAGE_INFO_COMMON(read_write image2d_t)
+DECL_IMAGE_INFO_COMMON(read_write image3d_t)
+DECL_IMAGE_INFO_COMMON(read_write image2d_array_t)
+#endif
+
// 2D extra Info
OVERLOADABLE int get_image_height(read_only image2d_t image)
{
@@ -598,6 +737,17 @@ OVERLOADABLE int2 get_image_dim(write_only image2d_t image)
return (int2){get_image_width(image), get_image_height(image)};
}
#endif
+
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_height(read_write image2d_t image)
+{
+ return __gen_ocl_get_image_height(image);
+}
+OVERLOADABLE int2 get_image_dim(read_write image2d_t image)
+{
+ return (int2){get_image_width(image), get_image_height(image)};
+}
+#endif
// End of 2D
// 3D extra Info
@@ -633,6 +783,24 @@ OVERLOADABLE int4 get_image_dim(write_only image3d_t image)
0);
}
#endif
+
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_height(read_write image3d_t image)
+{
+ return __gen_ocl_get_image_height(image);
+}
+OVERLOADABLE int get_image_depth(read_write image3d_t image)
+{
+ return __gen_ocl_get_image_depth(image);
+}
+OVERLOADABLE int4 get_image_dim(read_write image3d_t image)
+{
+ return (int4) (get_image_width(image),
+ get_image_height(image),
+ get_image_depth(image),
+ 0);
+}
+#endif
// 2D Array extra Info
OVERLOADABLE int get_image_height(read_only image2d_array_t image)
{
@@ -660,6 +828,20 @@ OVERLOADABLE size_t get_image_array_size(write_only image2d_array_t image)
return __gen_ocl_get_image_depth(image);
}
#endif
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_height(read_write image2d_array_t image)
+{
+ return __gen_ocl_get_image_height(image);
+}
+OVERLOADABLE int2 get_image_dim(read_write image2d_array_t image)
+{
+ return (int2){get_image_width(image), get_image_height(image)};
+}
+OVERLOADABLE size_t get_image_array_size(read_write image2d_array_t image)
+{
+ return __gen_ocl_get_image_depth(image);
+}
+#endif
// 1D Array info
OVERLOADABLE size_t get_image_array_size(read_only image1d_array_t image)
{
@@ -671,4 +853,10 @@ OVERLOADABLE size_t get_image_array_size(write_only image1d_array_t image)
return __gen_ocl_get_image_depth(image);
}
#endif
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE size_t get_image_array_size(read_write image1d_array_t image)
+{
+ return __gen_ocl_get_image_depth(image);
+}
+#endif
// End of 1DArray
diff --git a/backend/src/libocl/src/ocl_memcpy.cl b/backend/src/libocl/src/ocl_memcpy.cl
index 85f490f..131574d 100644
--- a/backend/src/libocl/src/ocl_memcpy.cl
+++ b/backend/src/libocl/src/ocl_memcpy.cl
@@ -37,13 +37,28 @@ void __gen_memcpy_ ##NAME (DST_SPACE uchar* dst, SRC_SPACE uchar* src, size_t si
} \
}
+#if (__OPENCL_C_VERSION__ >= 200)
#define DECL_ONE_SPACE_MEMCOPY_FN(NAME, DST_SPACE) \
DECL_TWO_SPACE_MEMCOPY_FN( NAME## g, DST_SPACE, __global) \
DECL_TWO_SPACE_MEMCOPY_FN( NAME## l, DST_SPACE, __local) \
DECL_TWO_SPACE_MEMCOPY_FN( NAME## p, DST_SPACE, __private) \
+ DECL_TWO_SPACE_MEMCOPY_FN( NAME## n, DST_SPACE, __generic) \
DECL_TWO_SPACE_MEMCOPY_FN( NAME## c, DST_SPACE, __constant)
DECL_ONE_SPACE_MEMCOPY_FN(g, __global)
DECL_ONE_SPACE_MEMCOPY_FN(l, __local)
DECL_ONE_SPACE_MEMCOPY_FN(p, __private)
+DECL_ONE_SPACE_MEMCOPY_FN(n, __generic)
+#else
+#define DECL_ONE_SPACE_MEMCOPY_FN(NAME, DST_SPACE) \
+ DECL_TWO_SPACE_MEMCOPY_FN( NAME## g, DST_SPACE, __global) \
+ DECL_TWO_SPACE_MEMCOPY_FN( NAME## l, DST_SPACE, __local) \
+ DECL_TWO_SPACE_MEMCOPY_FN( NAME## p, DST_SPACE, __private) \
+ DECL_TWO_SPACE_MEMCOPY_FN( NAME## c, DST_SPACE, __constant)
+
+DECL_ONE_SPACE_MEMCOPY_FN(g, __global)
+DECL_ONE_SPACE_MEMCOPY_FN(l, __local)
+DECL_ONE_SPACE_MEMCOPY_FN(p, __private)
+
+#endif
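The NAME suffix encodes destination space then source space, so the 2.0
branch adds a __generic source ("n") to every destination family plus a
__generic destination family. A minimal sketch of one generated entry point
(the signature follows the DECL_TWO_SPACE_MEMCOPY_FN context above; the
byte-copy body is an assumption about what that macro emits):

    // hypothetical expansion of DECL_TWO_SPACE_MEMCOPY_FN(gn, __global, __generic)
    void __gen_memcpy_gn(__global uchar* dst, __generic uchar* src, size_t size) {
      for (size_t i = 0; i < size; i++)   // plain byte-wise copy
        dst[i] = src[i];
    }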
diff --git a/backend/src/libocl/src/ocl_memset.cl b/backend/src/libocl/src/ocl_memset.cl
index d8bc5df..dda7e55 100644
--- a/backend/src/libocl/src/ocl_memset.cl
+++ b/backend/src/libocl/src/ocl_memset.cl
@@ -41,4 +41,7 @@ void __gen_memset_ ##NAME (DST_SPACE uchar* dst, uchar val, size_t size) { \
DECL_MEMSET_FN(g, __global)
DECL_MEMSET_FN(l, __local)
DECL_MEMSET_FN(p, __private)
+#if (__OPENCL_C_VERSION__ >= 200)
+DECL_MEMSET_FN(n, __generic)
+#endif
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 94bf178..3b2eb92 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -229,3 +229,27 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
return val;
};
+
+bool __gen_ocl_in_local(size_t p) {
+ bool cond1 = p > 0;
+ bool cond2 = p < 64*1024;
+ return cond1 && cond2;
+}
+
+#if (__OPENCL_C_VERSION__ >= 200)
+local void *__to_local(generic void *p) {
+ bool cond = __gen_ocl_in_local((size_t)p);
+ return cond ? (local void*)p : NULL;
+}
+private void *__to_private(generic void *p) {
+ bool cond = __gen_ocl_in_private((size_t)p);
+ return cond ? (private void*)p : NULL;
+}
+
+global void *__to_global(generic void *p) {
+ bool cond1 = __gen_ocl_in_local((size_t)p);
+ bool cond2 = __gen_ocl_in_private((size_t)p);
+ bool cond = cond1 || cond2;
+ return !cond ? (global void*)p : NULL;
+}
+#endif
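These helpers classify a generic pointer purely by its numeric value:
addresses in (0, 64K) count as __local, __gen_ocl_in_private covers the
private stack, and anything else is treated as __global. A minimal usage
sketch, assuming an OpenCL 2.0 build; calling the double-underscore helpers
directly is only for illustration, since the public to_global/to_local
built-ins presumably lower to them:

    kernel void classify(global int *out, global int *src) {
      generic int *p = src;  // a global pointer entering the generic space
      out[0] = (__to_global((generic void *)p) != NULL);  // expect 1
      out[1] = (__to_local((generic void *)p) != NULL);   // expect 0
    }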
diff --git a/backend/src/libocl/src/ocl_pipe.cl b/backend/src/libocl/src/ocl_pipe.cl
new file mode 100644
index 0000000..7bfd370
--- /dev/null
+++ b/backend/src/libocl/src/ocl_pipe.cl
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_pipe.h"
+#include "ocl_atom.h"
+#include "ocl_workitem.h"
+
+#define PIPE_SUCCESS 0
+#define PIPE_EMPTY -2
+#define PIPE_FULL -3
+#define PIPE_HEADER_SZ 128
+#define PIPE_INDEX_OUTRANGE -4
+#define PIPE_RESERVE_FAIL -5
+#define RID_MAGIC 0xDE
+#define RIDT ushort
+#define DEAD_PTR 0xFFFFFFFF
+
+PURE CONST __global void* __gen_ocl_get_pipe(pipe int p);
+PURE CONST ulong __gen_ocl_get_rid(reserve_id_t rid);
+PURE CONST reserve_id_t __gen_ocl_make_rid(ulong rid);
+
+int __read_pipe_2(pipe int p, __generic void* dst)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int data_size = atomic_sub(pheader + 6, 1);
+ if(data_size <= 0){
+ atomic_add(pheader + 6, 1);
+ return PIPE_EMPTY; // pipe is empty, nothing to read
+ }
+ __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+ int pack_num = pheader[0];
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int read_ptr = atomic_add(pheader + 3, 1);
+ if(read_ptr == pack_num - 1)
+ atomic_sub(pheader + 3, pack_num);
+ read_ptr = read_ptr % pack_num;
+ for(int i = 0; i < pack_size ; i++)
+ ((char*)dst)[i] = psrc[i + read_ptr*pack_size];
+ return 0;
+}
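The pipe functions address the header through raw int offsets; reading
__read_pipe_2 and __write_pipe_2 together, the layout appears to be the
following (an inference from the offsets used here, not a struct declared
anywhere in this patch):

    /* hypothetical view of the pipe header; the code uses raw pheader[i] */
    struct __pipe_header_sketch {
      int pack_num;     /* [0] capacity in packets            */
      int pack_size;    /* [1] bytes per packet               */
      int write_ptr;    /* [2] next write slot (mod pack_num) */
      int read_ptr;     /* [3] next read slot (mod pack_num)  */
      int unused[2];    /* [4..5] not referenced here         */
      int num_packets;  /* [6] current fill level             */
      /* padded to PIPE_HEADER_SZ (128) bytes; packet data follows */
    };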
+
+int __read_pipe_4(pipe int p, reserve_id_t id, uint index, void* dst)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+ ulong uid = __gen_ocl_get_rid(id);
+ RIDT* pid = (RIDT*)&uid;
+ RIDT start_pt = pid[0];
+ RIDT reserve_size = pid[1];
+ if(index >= reserve_size) return PIPE_INDEX_OUTRANGE;
+ int pack_num = pheader[0];
+ int pack_size = pheader[1];
+ int read_ptr = (start_pt + index) % pack_num;
+ int offset = read_ptr * pack_size;
+ for(int i = 0; i < pack_size ; i++)
+ ((char*)dst)[i] = psrc[i + offset];
+ return 0;
+}
+
+
+int __write_pipe_2(pipe int p, __generic void* src)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int pack_num = pheader[0];
+ int data_size = atomic_add(pheader + 6, 1);
+ if(data_size >= pack_num){
+ atomic_sub(pheader + 6, 1);
+ return PIPE_FULL; // pipe is full, cannot write
+ }
+ __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int write_ptr = atomic_add(pheader + 2, 1);
+ if(write_ptr == pack_num - 1)
+ atomic_sub(pheader + 2, pack_num);
+ write_ptr = write_ptr % pack_num;
+ for(int i = 0; i < pack_size ; i++)
+ psrc[i + write_ptr * pack_size] = ((char*)src)[i];
+ return 0;
+}
+
+int __write_pipe_4(pipe int p, reserve_id_t id, uint index, void* src)
+{
+ __global int* pheader = __gen_ocl_get_pipe(p);
+ __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+ ulong uid = __gen_ocl_get_rid(id);
+ RIDT* pid = (RIDT*)&uid;
+ RIDT start_pt = pid[0];
+ RIDT reserve_size = pid[1];
+ if(index >= reserve_size) return PIPE_INDEX_OUTRANGE;
+ int pack_num = pheader[0];
+ int pack_size = pheader[1];
+ int write_ptr = (start_pt + index) % pack_num;
+ int offset = write_ptr * pack_size;
+ for(int i = 0; i < pack_size ; i++)
+ psrc[i + offset] = ((char*)src)[i];
+ return 0;
+}
+
+reserve_id_t __reserve_read_pipe(pipe int p, uint num)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int data_size = atomic_sub(pheader + 6, num);
+ if(data_size < num){
+ atomic_add(pheader + 6, num);
+ return __gen_ocl_make_rid(0l);
+ }
+ int pack_num = pheader[0];
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int read_ptr = atomic_add(pheader + 3, num);
+ if(read_ptr == pack_num - num)
+ atomic_sub(pheader + 3, pack_num);
+ ulong uid = 0l;
+ RIDT* pid = (RIDT*)&uid;
+ pid[0] = read_ptr % pack_num;
+ pid[1] = num;
+ pid[2] = RID_MAGIC ;
+ return __gen_ocl_make_rid(uid);
+}
+
+void __commit_read_pipe(pipe int p, reserve_id_t rid) {}
+
+reserve_id_t __work_group_reserve_read_pipe(pipe int p, uint num)
+{
+ uint rid_ptr = DEAD_PTR;
+ int ret0 = 0;
+ if(get_local_linear_id()==0){
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int data_size = atomic_sub(pheader + 6, num);
+ if(data_size < num){
+ atomic_add(pheader + 6, num);
+ ret0 = 1;
+ }
+ int pack_num = pheader[0];
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int read_ptr = atomic_add(pheader + 3, num);
+ if(read_ptr == pack_num - num && !ret0)
+ atomic_sub(pheader + 3, pack_num);
+ if(!ret0)
+ rid_ptr = read_ptr % pack_num;
+ }
+ ulong uid = 0l;
+ RIDT* pid = (RIDT*)&uid;
+ rid_ptr = work_group_broadcast(rid_ptr,0,0,0);
+ pid[0] = rid_ptr;
+ pid[1] = num;
+ pid[2] = RID_MAGIC ;
+ if(rid_ptr == DEAD_PTR)
+ uid = 0l;
+ return __gen_ocl_make_rid(uid);
+}
+
+void __work_group_commit_read_pipe(pipe int p, reserve_id_t rid) {}
+
+reserve_id_t __sub_group_reserve_read_pipe(pipe int p, uint num)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int data_size = atomic_sub(pheader + 6, num);
+ if(data_size < num){
+ atomic_add(pheader + 6, num);
+ return __gen_ocl_make_rid(0l);
+ }
+ int pack_num = pheader[0];
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int read_ptr = atomic_add(pheader + 3, num);
+ if(read_ptr == pack_num - num)
+ atomic_sub(pheader + 3, pack_num);
+ ulong uid = 0l;
+ RIDT* pid = (RIDT*)&uid;
+ pid[0] = read_ptr % pack_num;
+ pid[1] = num;
+ pid[2] = RID_MAGIC ;
+ return __gen_ocl_make_rid(uid);
+}
+
+void __sub_group_commit_read_pipe(pipe int p, reserve_id_t rid) {}
+
+reserve_id_t __reserve_write_pipe(pipe int p, uint num)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int pack_num = pheader[0];
+ int data_size = atomic_add(pheader + 6, num);
+ if(data_size > pack_num - num){
+ atomic_sub(pheader + 6, num);
+ return __gen_ocl_make_rid(0l);
+ }
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int write_ptr = atomic_add(pheader + 2, num);
+ if(write_ptr == pack_num - num)
+ atomic_sub(pheader + 2, pack_num);
+ ulong uid = 0l;
+ RIDT* pid = (RIDT*)&uid;
+ pid[0] = write_ptr % pack_num;
+ pid[1] = num;
+ pid[2] = RID_MAGIC ;
+ return __gen_ocl_make_rid(uid);
+}
+void __commit_write_pipe(pipe int p, reserve_id_t rid) {}
+
+reserve_id_t __work_group_reserve_write_pipe(pipe int p, uint num)
+{
+ uint rid_ptr = DEAD_PTR;
+ int ret0 = 0;
+ if(get_local_linear_id()==0){
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int pack_num = pheader[0];
+ int data_size = atomic_add(pheader + 6, num);
+ if(data_size > pack_num - num){
+ atomic_sub(pheader + 6, num);
+ ret0 = 1;
+ }
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int write_ptr = atomic_add(pheader + 2, num);
+ if(write_ptr == pack_num - num && !ret0)
+ atomic_sub(pheader + 2, pack_num);
+ if(!ret0)
+ rid_ptr = write_ptr % pack_num;
+ }
+ ulong uid = 0l;
+ RIDT* pid = (RIDT*)&uid;
+ rid_ptr = work_group_broadcast(rid_ptr,0,0,0);
+ pid[0] = rid_ptr;
+ pid[1] = num;
+ pid[2] = RID_MAGIC ;
+ if(rid_ptr == DEAD_PTR)
+ uid = 0l;
+ return __gen_ocl_make_rid(uid);
+}
+void __work_group_commit_write_pipe(pipe int p, reserve_id_t rid) {}
+
+
+reserve_id_t __sub_group_reserve_write_pipe(pipe int p, uint num)
+{
+ __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+ int pack_num = pheader[0];
+ int data_size = atomic_add(pheader + 6, num);
+ if(data_size > pack_num - num){
+ atomic_sub(pheader + 6, num);
+ return __gen_ocl_make_rid(0l);
+ }
+ int pack_size = pheader[1];
+ int pipe_size = pack_num * pack_size;
+ int write_ptr = atomic_add(pheader + 2, num);
+ if(write_ptr == pack_num - num)
+ atomic_sub(pheader + 2, pack_num);
+ ulong uid = 0l;
+ RIDT* pid = (RIDT*)&uid;
+ pid[0] = write_ptr % pack_num;
+ pid[1] = num;
+ pid[2] = RID_MAGIC ;
+ return __gen_ocl_make_rid(uid);
+}
+
+void __sub_group_commit_write_pipe(pipe int p, reserve_id_t rid) {}
+
+bool is_valid_reserve_id(reserve_id_t rid)
+{
+ ulong uid = __gen_ocl_get_rid(rid);
+ RIDT* pid = (RIDT*)&uid;
+ if(pid[1] == 0) return false;
+ if(pid[2] != RID_MAGIC) return false;
+ return true;
+}
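A reserve_id_t here is a ulong carrying three RIDT (ushort) fields; this is
what every reserve function above constructs and what the validity check
inspects. The field meanings below are inferred from those uses, and
start_packet/num are placeholder names:

    ulong uid = 0l;
    RIDT *pid = (RIDT *)&uid;
    pid[0] = start_packet;  /* first reserved slot, mod pack_num */
    pid[1] = num;           /* number of packets reserved        */
    pid[2] = RID_MAGIC;     /* 0xDE validity tag                 */
    reserve_id_t rid = __gen_ocl_make_rid(uid);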
+
+/* Query functions */
+uint __get_pipe_max_packets(pipe int p)
+{
+ __global int* pheader = __gen_ocl_get_pipe(p);
+ return pheader[0];
+}
+
+uint __get_pipe_num_packets(pipe int p)
+{
+ __global int* pheader = __gen_ocl_get_pipe(p);
+ return pheader[6];
+}
diff --git a/backend/src/libocl/src/ocl_sync.cl b/backend/src/libocl/src/ocl_sync.cl
index b6efef8..590596a 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/backend/src/libocl/src/ocl_sync.cl
@@ -16,10 +16,10 @@
*
*/
#include "ocl_sync.h"
+#include "ocl_misc.h"
void __gen_ocl_barrier_local(void);
void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
void __gen_ocl_debugwait(void);
OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
@@ -30,3 +30,7 @@ OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags) {
OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags) {
}
+cl_mem_fence_flags get_fence(void *ptr) {
+ bool cond = __gen_ocl_in_local((size_t)ptr);
+ return cond ? CLK_LOCAL_MEM_FENCE : CLK_GLOBAL_MEM_FENCE;
+}
diff --git a/backend/src/libocl/src/ocl_vload_20.cl b/backend/src/libocl/src/ocl_vload_20.cl
new file mode 100644
index 0000000..ab06aa2
--- /dev/null
+++ b/backend/src/libocl/src/ocl_vload_20.cl
@@ -0,0 +1,284 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "ocl_vload_20.h"
+#include "ocl_relational.h"
+
+// These loads and stores use untyped reads and writes, so we can simply
+// cast to vector loads / stores. This is not C99 compliant because of the
+// aliasing rules, but that is fine: TBAA is not enabled in the compiler.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+ return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+ *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+ return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+}
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+ DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __generic)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 2 * offset) = v.s0; \
+ *(p + 2 * offset + 1) = v.s1; \
+} \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+ vstore2(v.lo, 2*offset, p); \
+ vstore2(v.hi, 2*offset, p+2); \
+} \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+ vstore4(v.lo, 2*offset, p); \
+ vstore4(v.hi, 2*offset, p+4); \
+} \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+ vstore8(v.lo, 2*offset, p); \
+ vstore8(v.hi, 2*offset, p+8); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+ DECL_BYTE_RD_SPACE(TYPE, __generic) \
+ DECL_BYTE_RD_SPACE(TYPE, __constant) \
+ DECL_BYTE_WR_SPACE(TYPE, __generic)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(half)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
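As a concrete instance, DECL_UNTYPED_RW_ALL(float) provides, among others,
this pair (a direct expansion of DECL_UNTYPED_RW_SPACE_N(float, 4, __generic)
above):

    OVERLOADABLE float4 vload4(size_t offset, const __generic float *p) {
      return *(__generic float4 *) (p + 4 * offset);
    }
    OVERLOADABLE void vstore4(float4 v, size_t offset, __generic float *p) {
      *(__generic float4 *) (p + 4 * offset) = v;
    }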
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+OVERLOADABLE short f32to16_rtp(float f) {
+ short s = __gen_ocl_f32to16(f);
+ float con = __gen_ocl_f16to32(s);
+ //if(isinf(con)) return s;
+ if (f > con)
+ return s - signbit(f) * 2 + 1;
+ else
+ return s;
+}
+
+OVERLOADABLE short f32to16_rtn(float f) {
+ short s = __gen_ocl_f32to16(f);
+ float con = __gen_ocl_f16to32(s);
+ //if(isinf(con)) return s;
+ if (con > f)
+ return s + signbit(f) * 2 - 1;
+ else
+ return s;
+}
+
+OVERLOADABLE short f32to16_rtz(float f) {
+ short s = __gen_ocl_f32to16(f);
+ float con = __gen_ocl_f16to32(s);
+ //if(isinf(con)) return s;
+ if (((con > f) && !signbit(f)) ||
+ ((con < f) && signbit(f)))
+ return s - 1;
+ else
+ return s;
+}
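Each helper converts with the hardware round-to-nearest-even path
(__gen_ocl_f32to16), converts back, and nudges the half by one ulp when the
comparison shows the default rounding went the wrong way for the requested
direction. A worked case for f32to16_rtp (the input value is only an example):

    /* f = 1.0003f: the nearest half is 1.0, so con = 1.0 < f.          */
    /* Round-to-positive may not round down; since signbit(f) == 0, the */
    /* adjustment s - 0*2 + 1 == s + 1 gives the next half up,          */
    /* 1.0009765625 (one ulp above 1.0).                                */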
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+ return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p) { \
+ return vload_half(offset, p); \
+} \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+ return (float2)(vload_half(offset*2, p), \
+ vload_half(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p) { \
+ return (float2)(vloada_half(offset*2, p), \
+ vloada_half(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+ return (float3)(vload_half(offset*3, p), \
+ vload_half(offset*3 + 1, p), \
+ vload_half(offset*3 + 2, p)); \
+} \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+ return (float3)(vload_half(offset*4, p), \
+ vload_half(offset*4 + 1, p), \
+ vload_half(offset*4 + 2, p)); \
+} \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+ return (float4)(vload_half2(offset*2, p), \
+ vload_half2(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p) { \
+ return (float4)(vloada_half2(offset*2, p), \
+ vloada_half2(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+ return (float8)(vload_half4(offset*2, p), \
+ vload_half4(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p) { \
+ return (float8)(vloada_half4(offset*2, p), \
+ vloada_half4(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+ return (float16)(vload_half8(offset*2, p), \
+ vload_half8(offset*2 + 1, p)); \
+}\
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p) { \
+ return (float16)(vloada_half8(offset*2, p), \
+ vloada_half8(offset*2 + 1, p)); \
+}\
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+ *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.lo, offset*2, p); \
+ vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+ vstore_half2##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.s0, offset*3, p); \
+ vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+ vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.s0, offset*4, p); \
+ vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+ vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+ vstore_half2##ROUND(data.lo, offset*2, p); \
+ vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+ vstore_half4##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+ vstore_half4##ROUND(data.lo, offset*2, p); \
+ vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+ vstore_half8##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+ vstore_half8##ROUND(data.lo, offset*2, p); \
+ vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+ vstore_half16##ROUND(data, offset, p); \
+}
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, , __gen_ocl_f32to16) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn) \
+
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__generic)
+
+DECL_HALF_ST_SPACE(__generic)
+
+//#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index 235f12b..eb6210d 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -30,6 +30,7 @@ PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
PURE CONST unsigned int __gen_ocl_##NAME##2(void);
DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_enqueued_local_size)
DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
@@ -37,7 +38,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
#undef DECL_INTERNAL_WORK_ITEM_FN
#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET) \
-OVERLOADABLE unsigned NAME(unsigned int dim) { \
+OVERLOADABLE size_t NAME(unsigned int dim) { \
if (dim == 0) return __gen_ocl_##NAME##0(); \
else if (dim == 1) return __gen_ocl_##NAME##1(); \
else if (dim == 2) return __gen_ocl_##NAME##2(); \
@@ -46,24 +47,18 @@ OVERLOADABLE unsigned NAME(unsigned int dim) { \
DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_enqueued_local_size, 1)
DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
#undef DECL_PUBLIC_WORK_ITEM_FN
-OVERLOADABLE uint get_global_id(uint dim) {
- return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
+OVERLOADABLE size_t get_global_id(uint dim) {
+ return get_local_id(dim) + get_enqueued_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
}
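With non-uniform work-group sizes, using the enqueued size keeps global IDs
dense across groups: for a global size of 100 enqueued in groups of 16 there
are 7 groups and the last one holds only 4 work-items, yet local ID 3 in
group 6 still maps to 3 + 16*6 + 0 = 99, the last valid global ID.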
-OVERLOADABLE uint get_enqueued_local_size (uint dimindx)
-{
- //TODO: should be different with get_local_size when support
- //non-uniform work-group size
- return get_local_size(dimindx);
-}
-
-OVERLOADABLE uint get_global_linear_id(void)
+OVERLOADABLE size_t get_global_linear_id(void)
{
uint dim = __gen_ocl_get_work_dim();
if (dim == 1) return get_global_id(0) - get_global_offset(0);
@@ -76,12 +71,12 @@ OVERLOADABLE uint get_global_linear_id(void)
else return 0;
}
-OVERLOADABLE uint get_local_linear_id(void)
+OVERLOADABLE size_t get_local_linear_id(void)
{
uint dim = __gen_ocl_get_work_dim();
if (dim == 1) return get_local_id(0);
- else if (dim == 2) return get_local_id(1) * get_local_size (0) + get_local_id(0);
- else if (dim == 3) return (get_local_id(2) * get_local_size(1) * get_local_size(0)) +
- (get_local_id(1) * get_local_size(0)) + get_local_id(0);
+ else if (dim == 2) return get_local_id(1) * get_enqueued_local_size(0) + get_local_id(0);
+ else if (dim == 3) return (get_local_id(2) * get_enqueued_local_size(1) * get_enqueued_local_size(0)) +
+ (get_local_id(1) * get_enqueued_local_size(0)) + get_local_id(0);
else return 0;
}
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
index f5c65df..c16a99f 100644
--- a/backend/src/libocl/tmpl/ocl_defines.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -18,13 +18,18 @@
#ifndef __OCL_COMMON_DEF_H__
#define __OCL_COMMON_DEF_H__
-#define __OPENCL_VERSION__ 120
#define __CL_VERSION_1_0__ 100
#define __CL_VERSION_1_1__ 110
#define __CL_VERSION_1_2__ 120
#define CL_VERSION_1_0 100
#define CL_VERSION_1_1 110
#define CL_VERSION_1_2 120
+#if (__OPENCL_C_VERSION__ >= 200)
+#define __OPENCL_VERSION__ 200
+#define CL_VERSION_2_0 200
+#else
+#define __OPENCL_VERSION__ 120
+#endif
#define __ENDIAN_LITTLE__ 1
#define __IMAGE_SUPPORT__ 1
#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 7e7f4ae..3327389 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -40,6 +40,18 @@ SDEF(long, s, 64);
SDEF(ulong, u, 64);
#undef SDEF
+#define SDEF(TYPE, TYPE_NAME, SIZE) \
+OVERLOADABLE TYPE ctz(TYPE x){ return ctz_##TYPE_NAME##SIZE(x);}
+SDEF(char, s, 8);
+SDEF(uchar, u, 8);
+SDEF(short, s, 16);
+SDEF(ushort, u, 16);
+SDEF(int, s, 32);
+SDEF(uint, u, 32);
+SDEF(long, s, 64);
+SDEF(ulong, u, 64);
+#undef SDEF
+
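Each SDEF line is a one-line forwarder to a width-specific helper; the int
case, for example, expands directly to:

    OVERLOADABLE int ctz(int x){ return ctz_s32(x);}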
#define SDEF(TYPE) \
OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
SDEF(char);
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index 4b3b5ae..ac1800b 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -54,6 +54,24 @@ uint clz_u32(uint);
long clz_s64(long);
ulong clz_u64(ulong);
+OVERLOADABLE char ctz(char x);
+OVERLOADABLE uchar ctz(uchar x);
+OVERLOADABLE short ctz(short x);
+OVERLOADABLE ushort ctz(ushort x);
+OVERLOADABLE int ctz(int x);
+OVERLOADABLE uint ctz(uint x);
+OVERLOADABLE long ctz(long x);
+OVERLOADABLE ulong ctz(ulong x);
+
+char ctz_s8(char);
+uchar ctz_u8(uchar);
+short ctz_s16(short);
+ushort ctz_u16(ushort);
+int ctz_s32(int);
+uint ctz_u32(uint);
+long ctz_s64(long);
+ulong ctz_u64(ulong);
+
OVERLOADABLE char popcount(char x);
OVERLOADABLE uchar popcount(uchar x);
OVERLOADABLE short popcount(short x);
diff --git a/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
new file mode 100644
index 0000000..d47e0a2
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
@@ -0,0 +1,3801 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_math_20.h"
+#include "ocl_float.h"
+#include "ocl_relational.h"
+#include "ocl_common.h"
+#include "ocl_integer.h"
+
+extern constant int __ocl_math_fastpath_flag;
+
+CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
+CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
+CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
+CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
+PURE CONST float __gen_ocl_rsqrt(float x);
+CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32");
+CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
+PURE CONST float __gen_ocl_pow(float x, float y) __asm("llvm.pow" ".f32");
+PURE CONST float __gen_ocl_rcp(float x);
+CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
+CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
+CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
+CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
+
+
+/* native functions */
+OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+OVERLOADABLE float native_log(float x) {
+ return native_log2(x) * 0.6931472002f;
+}
+OVERLOADABLE float native_log10(float x) {
+ return native_log2(x) * 0.3010299956f;
+}
+OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+OVERLOADABLE float native_tan(float x) {
+ return native_sin(x) / native_cos(x);
+}
+OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
+OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
+OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
+OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+
+/* Fast path */
+OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
+ return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
+ return native_log(x + native_sqrt(x * x + 1));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
+ return 0.5f * native_log((1 + x) / (1 - x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
+ return __gen_ocl_pow(x, 0.3333333333f);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
+ return native_cos(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
+ return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
+ return __gen_ocl_cos(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
+ return native_exp(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
+ return native_exp10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
+ return __gen_ocl_pow(M_E_F, x) - 1;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
+ return x-y*__gen_ocl_rndz(x/y);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
+ return __gen_ocl_sqrt(x*x + y*y);
+}
+OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
+ return __gen_ocl_rndd(native_log2(x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
+ return __gen_ocl_pow(2, n) * x;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
+ return native_log(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
+ return native_log2(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
+ return native_log10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
+ return native_log(x + 1);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
+ return __gen_ocl_rndd(native_log2(x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
+ return x-y*__gen_ocl_rnde(x/y);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
+ return __gen_ocl_pow(x, 1.f / n);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
+ return native_sin(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, float *cosval) {
+ *cosval = native_cos(x);
+ return native_sin(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
+ return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
+ return __gen_ocl_sin(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
+ return native_tan(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
+ float y = native_exp(-2 * x);
+ return (1 - y) / (1 + y);
+}
+
+
+/* Internal implementations, high accuracy. */
+OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+ union { unsigned u; float f; } ux, uy;
+ ux.f = x;
+ uy.f = y;
+ ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
+ return ux.f;
+}
+
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ union { unsigned int i; float f; } u;
+ const float
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
+ Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+ Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+ Lg3 = 2.8571429849e-01, /* 3E924925 */
+ Lg4 = 2.2222198546e-01; /* 3E638E29 */
+
+ const float zero = 0.0;
+ float fsq, f, s, z, R, w, t1, t2, partial;
+ int k, ix, i, j;
+
+ u.f = x; ix = u.i;
+ k = 0;
+
+ k += (ix>>23) - 127;
+ ix &= 0x007fffff;
+ i = (ix + (0x95f64<<3)) & 0x800000;
+ u.i = ix | (i^0x3f800000); x = u.f;
+ k += (i>>23);
+ f = x - 1.0f;
+ fsq = f * f;
+
+ if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+ R = fsq * (0.5f - 0.33333333333333333f * f);
+ return k * ln2_hi + k * ln2_lo + f - R;
+ }
+
+ s = f / (2.0f + f);
+ z = s * s;
+ i = ix - (0x6147a << 3);
+ w = z * z;
+ j = (0x6b851 << 3) - ix;
+ t1= w * mad(w, Lg4, Lg2);
+ t2= z * mad(w, Lg3, Lg1);
+ i |= j;
+ R = t2 + t1;
+ partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+ return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;
+}
+
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+ union { unsigned int i; float f; } u;
+ u.f = x;
+ int ix = u.i;
+
+ if (ix < 0 )
+ return NAN; /* log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
+
+ return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+ union { float f; unsigned i; } u;
+ const float
+ ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
+ log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
+ log10_2lo = 7.9034151668e-07; /* 0x355427db */
+
+ float y, z;
+ int i, k, hx;
+
+ u.f = x; hx = u.i;
+
+ if (hx<0)
+ return NAN; /* log(-#) = NaN */
+ if (hx >= 0x7f800000)
+ return NAN;
+
+ k = (hx >> 23) - 127;
+ i = ((unsigned)k & 0x80000000) >> 31;
+ hx = (hx&0x007fffff) | ((0x7f-i) << 23);
+ y = (float)(k + i);
+ u.i = hx; x = u.f;
+
+ return y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
+}
+
+
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
+ const float zero = 0.0,
+ invln2 = 0x1.715476p+0f;
+ int ix;
+
+ union { float f; int i; } u;
+ u.f = x; ix = u.i;
+
+ if (ix < 0)
+ return NAN; /** log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
+
+ return invln2 * __gen_ocl_internal_log_valid(x);
+}
+
+
+float __gen_ocl_scalbnf (float x, int n){
+ /* copy from fdlibm */
+ float two25 = 3.355443200e+07, /* 0x4c000000 */
+ twom25 = 2.9802322388e-08, /* 0x33000000 */
+ huge = 1.0e+30,
+ tiny = 1.0e-30;
+ int k,ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ k = (ix&0x7f800000)>>23; /* extract exponent */
+ if (k==0) { /* 0 or subnormal x */
+ if ((ix&0x7fffffff)==0) return x; /* +-0 */
+ x *= two25;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ k = ((ix&0x7f800000)>>23) - 25;
+ }
+ if (k==0xff) return x+x; /* NaN or Inf */
+ if (n< -50000)
+ return tiny*__gen_ocl_internal_copysign(tiny,x); /*underflow*/
+ if (n> 50000 || k+n > 0xfe)
+ return huge*__gen_ocl_internal_copysign(huge,x); /* overflow */
+ /* Now k and n are bounded we know that k = k+n does not overflow. */
+ k = k+n;
+ if (k > 0) { /* normal result */
+ GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+ return x;
+ }
+ if (k <= -25)
+ return tiny*__gen_ocl_internal_copysign(tiny,x); /*underflow*/
+ k += 25; /* subnormal result */
+ GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+ return x*twom25;
+}
+
+const __constant unsigned int two_over_pi[] = {
+0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
+0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
+0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
+};
+
+// The main idea is from "Radian Reduction for Trigonometric Functions"
+// by Mary H. Payne and Robert N. Hanek. Another reference is
+// "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
+// by Roger Alan Smith, which gives the worst case. For single floats
+// the worst case is x = 0x1.47d0fep34, where the fraction part of
+// x*(2.0/pi) has 29 leading zero bits. So we need at least
+// 29 (leading zeros) + 24 (fraction) + 12 (integer) bits plus guard
+// bits, i.e. 65 bits plus guard bits. We compute with 12*7 = 84 bits,
+// which leaves about 19 guard bits. If further precision is needed,
+// more guard bits are required.
+// Note the two leading 0 entries in two_over_pi; they handle inputs
+// less than 0x1.0p23.
+
+int payne_hanek(float x, float *y) {
+ union { float f; unsigned u;} ieee;
+ ieee.f = x;
+ unsigned u = ieee.u;
+ int k = ((u & 0x7f800000) >> 23)-127;
+ int ma = (u & 0x7fffff) | 0x800000;
+ unsigned high, low;
+ high = (ma & 0xfff000) >> 12;
+ low = ma & 0xfff;
+
+ // To tune the macros below, you need to fully understand the algorithm
+#define CALC_BLOCKS 7
+#define ZERO_BITS 2
+
+ unsigned result[CALC_BLOCKS];
+
+ // round down; note we need 2 bits of integer precision
+ int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
+
+ for (int i = 0; i < CALC_BLOCKS; i++) {
+ result[i] = low * two_over_pi[index+i+ZERO_BITS] ;
+ result[i] += high * two_over_pi[index+i+1+ZERO_BITS];
+ }
+
+ for (int i = CALC_BLOCKS-1; i > 0; i--) {
+ int temp = result[i] >> 12;
+ result[i] -= temp << 12;
+ result[i-1] += temp;
+ }
+#undef CALC_BLOCKS
+#undef ZERO_BITS
+
+ // Get the number of integer digits in result[0]; only 12 bits are valid,
+ // which means result[0] holds (12 - intDigit) fraction digits.
+
+ int intDigit = index*(-12) + (k-23);
+
+ // The integer bits may lie entirely in result[0], or be split between
+ // result[0] and result[1], so we merge successive blocks into wider
+ // words, which simplifies the code below.
+
+ unsigned b0 = (result[0] << 12) | result[1];
+ unsigned b1 = (result[2] << 12) | result[3];
+ unsigned b2 = (result[4] << 12) | result[5];
+ unsigned b3 = (result[6] << 12);
+
+ unsigned intPart = b0 >> (24-intDigit);
+
+ unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
+ unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
+ unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
+
+ // Larger than 0.5? That means larger than pi/4, so we need to
+ // transform from [0, pi/2] to [-pi/4, pi/4] via -(1.0 - fract).
+ int largerPiBy4 = ((fract1 & 0x800000) != 0);
+ int sign = largerPiBy4 ? 1 : 0;
+ intPart = largerPiBy4 ? (intPart+1) : intPart;
+
+ fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
+ fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
+ fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
+
+ int leadingZero = (fract1 == 0);
+
+ // +1 is for the hidden bit 1 in floating-point format
+ int exponent = leadingZero ? -(24+1) : -(0+1);
+
+ fract1 = leadingZero ? fract2 : fract1;
+ fract2 = leadingZero ? fract3 : fract2;
+
+ // fract1 may have leading zeros; shift them out and adjust the exponent
+ int shift = clz(fract1)-8;
+ exponent += -shift;
+
+ float pio2 = 0x1.921fb6p+0;
+ unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
+
+ // we know that denormal numbers will not appear here
+ ieee.u = (sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
+ *y = ieee.f * pio2;
+ return intPart;
+}
+
+int argumentReduceSmall(float x, float * remainder) {
+ union {
+ float f;
+ unsigned u;
+ } ieee;
+
+ float twoByPi = 2.0f/3.14159265f;
+ float piBy2_1h = (float) 0xc90/0x1.0p11,
+ piBy2_1l = (float) 0xfda/0x1.0p23,
+ piBy2_2h = (float) 0xa22/0x1.0p35,
+ piBy2_2l = (float) 0x168/0x1.0p47,
+ piBy2_3h = (float) 0xc23/0x1.0p59,
+ piBy2_3l = (float) 0x4c4/0x1.0p71;
+
+ float y = (float)(int)(twoByPi * x + 0.5f);
+ ieee.f = y;
+ ieee.u = ieee.u & 0xfffff000;
+
+ float yh = ieee.f;
+ float yl = y - yh;
+ float rem = x - yh*piBy2_1h - yh*piBy2_1l - yl*piBy2_1h - yl*piBy2_1l;
+ rem = rem - yh*piBy2_2h - yh*piBy2_2l - yl*piBy2_2h - yl*piBy2_2l;
+ rem = rem - yh*piBy2_3h - yh*piBy2_3l - yl*piBy2_3h - yl*piBy2_3l;
+
+ *remainder = rem;
+ return (int)y;
+}
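This is a Cody-Waite style reduction: pi/2 is split into 12-bit pieces
(piBy2_*), and y is split into a high part with at most 12 significant bits
(the & 0xfffff000 mask) plus a low remainder, so each product of a y piece
with a pi/2 piece carries at most 12 + 12 = 24 significant bits and is
therefore exact in single precision; rounding error only enters through the
final sums.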
+
+
+int __ieee754_rem_pio2f(float x, float *y) {
+ if (x < 4000.0f) {
+ return argumentReduceSmall(x, y);
+ } else {
+ return payne_hanek(x, y);
+ }
+}
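Callers get back the quadrant count and a remainder y in [-pi/4, pi/4];
a minimal usage sketch, mirroring how sin() consumes it just below:

    float y;
    int n = __ieee754_rem_pio2f(x, &y);  /* n counts multiples of pi/2 */
    float s = __kernel_sinf(y);
    float c = __kernel_cosf(y, 0.0f);
    float r = (n & 1) ? c : s;           /* odd quadrants swap sin/cos */
    if ((n & 3) > 1) r = -r;             /* quadrants 2 and 3 negate   */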
+
+OVERLOADABLE float __kernel_sinf(float x)
+{
+ /* copied from fdlibm */
+ const float
+ S1 = -1.6666667163e-01, /* 0xbe2aaaab */
+ S2 = 8.3333337680e-03, /* 0x3c088889 */
+ S3 = -1.9841270114e-04, /* 0xb9500d01 */
+ S4 = 2.7557314297e-06; /* 0x3638ef1b */
+ float z,r,v;
+ z = x*x;
+ v = z*x;
+ r = mad(z, mad(z, mad(z, S4, S3), S2), S1);
+
+ return mad(v, r, x);
+}
+
+float __kernel_cosf(float x, float y)
+{
+ /* copied from fdlibm */
+ const float
+ one = 1.0000000000e+00, /* 0x3f800000 */
+ C1 = 4.1666667908e-02, /* 0x3d2aaaab */
+ C2 = -1.3888889225e-03, /* 0xbab60b61 */
+ C3 = 2.4801587642e-05; /* 0x37d00d01 */
+ float a,hz,z,r,qx;
+ int ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ ix &= 0x7fffffff; /* ix = |x|'s high word*/
+ z = x*x;
+ r = z * mad(z, mad(z, C3, C2), C1);
+
+ if(ix < 0x3e99999a) /* if |x| < 0.3 */
+ return one - ((float)0.5*z - (z*r - x*y));
+ else {
+ GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+ hz = (float)0.5*z-qx;
+ a = one-qx;
+ return a - (hz - (z*r-x*y));
+ }
+}
+
+OVERLOADABLE float sin(float x)
+{
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_sin(x);
+
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
+ float y,z=0.0;
+ int n, ix;
+
+ float negative = x < 0.0f? -1.0f : 1.0f;
+ x = fabs(x);
+
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ ix &= 0x7fffffff;
+
+ /* sin(Inf or NaN) is NaN */
+ if (ix >= 0x7f800000) return x-x;
+
+ if(x <= pio4)
+ return negative * __kernel_sinf(x);
+ /* argument reduction needed */
+ else {
+ n = __ieee754_rem_pio2f(x,&y);
+ float s = __kernel_sinf(y);
+ float c = __kernel_cosf(y,0.0f);
+ float ret = (n&1) ? negative*c : negative*s;
+ return (n&3)> 1? -1.0f*ret : ret;
+ }
+}
+
+OVERLOADABLE float cos(float x)
+{
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_cos(x);
+
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
+ float y,z=0.0;
+ int n, ix;
+ x = __gen_ocl_fabs(x);
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+ ix &= 0x7fffffff;
+
+ /* cos(Inf or NaN) is NaN */
+ if (ix >= 0x7f800000) return x-x;
+
+ if(x <= pio4)
+ return __kernel_cosf(x, 0.f);
+ /* argument reduction needed */
+ else {
+ n = __ieee754_rem_pio2f(x,&y);
+ n &= 3;
+ float c = __kernel_cosf(y, 0.0f);
+ float s = __kernel_sinf(y);
+ float v = (n&1) ? s : c;
+ /* n&3 return
+ 0 cos(y)
+ 1 -sin(y)
+ 2 -cos(y)
+ 3 sin(y)
+ */
+ int mask = (n>>1) ^ n;
+ float sign = (mask&1) ? -1.0f : 1.0f;
+ return sign * v;
+ }
+}
+
+float __kernel_tanf(float x, float y, int iy)
+{
+ /* copied from fdlibm */
+ float z,r,v,w,s;
+ int ix,hx;
+ const float
+ one = 1.0000000000e+00, /* 0x3f800000 */
+ pio4 = 7.8539812565e-01, /* 0x3f490fda */
+ pio4lo= 3.7748947079e-08; /* 0x33222168 */
+ float T[13];
+ T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+ T[1] = 1.3333334029e-01; /* 0x3e088889 */
+ T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+ T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+ T[4] = 8.8632395491e-03; /* 0x3c11371f */
+ T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+ T[6] = 1.4562094584e-03; /* 0x3abede48 */
+ T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff; /* high word of |x| */
+ if(ix<0x31800000) /* x < 2**-28 */
+ {if((int)x==0) { /* generate inexact */
+ if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+ else return (iy==1)? x: -one/x;
+ }
+ }
+ if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
+ if(hx<0) {x = -x; y = -y;}
+ z = pio4-x;
+ w = pio4lo-y;
+ x = z+w; y = 0.0;
+ }
+ z = x*x;
+ w = z*z;
+ /* Break x^5*(T[1]+x^2*T[2]+...) into
+  *   x^5*(T[1]+x^4*T[3]+x^8*T[5]+x^12*T[7]) +
+  *   x^5*(x^2*(T[2]+x^4*T[4]+x^8*T[6]))
+  */
+
+ r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
+ v = z* mad(w, mad(w, T[6], T[4]), T[2]);
+
+ s = z*x;
+ r = mad(z, mad(s, r + v, y), y);
+ r += T[0]*s;
+ w = x+r;
+ if(ix>=0x3f2ca140) {
+ v = (float)iy;
+ return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+ }
+ if(iy==1) return w;
+ else
+ return -1.0/(x+r);
+}
+
+OVERLOADABLE float tan(float x)
+{
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_tan(x);
+
+ float y,z=0.0;
+ int n, ix;
+ float negative = x < 0.0f? -1.0f : 1.0f;
+ x = negative * x;
+
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+ ix &= 0x7fffffff;
+
+ /* tan(Inf or NaN) is NaN */
+ if (ix>=0x7f800000) return x-x; /* NaN */
+
+ /* argument reduction needed */
+ else {
+ n = __ieee754_rem_pio2f(x,&y);
+ return negative * __kernel_tanf(y,0.0f,1-((n&1)<<1)); /* 1 -- n even
+ -1 -- n odd */
+ }
+}
+
+OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+ int ix;
+ if(isinf(x) || isnan(x)) { return NAN; }
+ if(x < 0.0f) { x = -x; }
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ if(x> 0x1.0p24) return 1.0f;
+ float m = __gen_ocl_internal_floor(x);
+ ix = (int)m;
+ m = x-m;
+ if((ix&0x1) != 0) m+=1.0f;
+ ix = __gen_ocl_internal_floor(m*4.0f);
+
+ switch(ix) {
+ case 0:
+ return __kernel_cosf(m*M_PI_F, 0.0f);
+ case 1:
+ case 2:
+ return __kernel_sinf((0.5f-m)*M_PI_F);
+ case 3:
+ case 4:
+ return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+ case 5:
+ case 6:
+ return __kernel_sinf((m-1.5f)*M_PI_F);
+ default:
+ return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+ }
+}
+
+OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+ float sign = 1.0f;
+ int ix;
+ if(isinf(x)) return NAN;
+ if(x < 0.0f) { x = -x; sign = -1.0f; }
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ if(x> 0x1.0p24) return 0.0f;
+ float m = __gen_ocl_internal_floor(x);
+ ix = (int)m;
+ m = x-m;
+ if((ix&0x1) != 0) m+=1.0f;
+ ix = __gen_ocl_internal_floor(m*4.0f);
+
+ switch(ix) {
+ case 0:
+ return sign*__kernel_sinf(m*M_PI_F);
+ case 1:
+ case 2:
+ return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+ case 3:
+ case 4:
+ return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
+ case 5:
+ case 6:
+ return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+ default:
+ return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
+ }
+
+}
+
+OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ const float
+ zero= 0.,
+ one = 1.0000000000e+00,
+ pi = 3.1415927410e+00,
+ a0 = 7.7215664089e-02,
+ a1 = 3.2246702909e-01,
+ a2 = 6.7352302372e-02,
+ a3 = 2.0580807701e-02,
+ a4 = 7.3855509982e-03,
+ a5 = 2.8905137442e-03,
+ a6 = 1.1927076848e-03,
+ a7 = 5.1006977446e-04,
+ a8 = 2.2086278477e-04,
+ a9 = 1.0801156895e-04,
+ a10 = 2.5214456400e-05,
+ a11 = 4.4864096708e-05,
+ tc = 1.4616321325e+00,
+ tf = -1.2148628384e-01,
+ tt = 6.6971006518e-09,
+ t0 = 4.8383611441e-01,
+ t1 = -1.4758771658e-01,
+ t2 = 6.4624942839e-02,
+ t3 = -3.2788541168e-02,
+ t4 = 1.7970675603e-02,
+ t5 = -1.0314224288e-02,
+ t6 = 6.1005386524e-03,
+ t7 = -3.6845202558e-03,
+ t8 = 2.2596477065e-03,
+ t9 = -1.4034647029e-03,
+ t10 = 8.8108185446e-04,
+ t11 = -5.3859531181e-04,
+ t12 = 3.1563205994e-04,
+ t13 = -3.1275415677e-04,
+ t14 = 3.3552918467e-04,
+ u0 = -7.7215664089e-02,
+ u1 = 6.3282704353e-01,
+ u2 = 1.4549225569e+00,
+ u3 = 9.7771751881e-01,
+ u4 = 2.2896373272e-01,
+ u5 = 1.3381091878e-02,
+ v1 = 2.4559779167e+00,
+ v2 = 2.1284897327e+00,
+ v3 = 7.6928514242e-01,
+ v4 = 1.0422264785e-01,
+ v5 = 3.2170924824e-03,
+ s0 = -7.7215664089e-02,
+ s1 = 2.1498242021e-01,
+ s2 = 3.2577878237e-01,
+ s3 = 1.4635047317e-01,
+ s4 = 2.6642270386e-02,
+ s5 = 1.8402845599e-03,
+ s6 = 3.1947532989e-05,
+ r1 = 1.3920053244e+00,
+ r2 = 7.2193557024e-01,
+ r3 = 1.7193385959e-01,
+ r4 = 1.8645919859e-02,
+ r5 = 7.7794247773e-04,
+ r6 = 7.3266842264e-06,
+ w0 = 4.1893854737e-01,
+ w1 = 8.3333335817e-02,
+ w2 = -2.7777778450e-03,
+ w3 = 7.9365057172e-04,
+ w4 = -5.9518753551e-04,
+ w5 = 8.3633989561e-04,
+ w6 = -1.6309292987e-03;
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+ int i, hx, ix;
+ nadj = 0;
+ hx = *(int *)&x;
+ ix = hx & 0x7fffffff;
+ if (ix >= 0x7f800000)
+ return x * x;
+ if (ix == 0)
+ return ((x + one) / zero);
+ if (ix < 0x1c800000) {
+ if (hx < 0) {
+ return -native_log(-x);
+ } else
+ return -native_log(x);
+ }
+ if (hx < 0) {
+ if (ix >= 0x4b000000)
+ return ((-x) / zero);
+ t = __gen_ocl_internal_sinpi(x);
+ if (t == zero)
+ return ((-x) / zero);
+ nadj = native_log(pi / __gen_ocl_fabs(t * x));
+ x = -x;
+ }
+ if (ix == 0x3f800000 || ix == 0x40000000)
+ r = 0;
+ else if (ix < 0x40000000) {
+ if (ix <= 0x3f666666) {
+ r = -native_log(x);
+ if (ix >= 0x3f3b4a20) {
+ y = one - x;
+ i = 0;
+ } else if (ix >= 0x3e6d3308) {
+ y = x - (tc - one);
+ i = 1;
+ } else {
+ y = x;
+ i = 2;
+ }
+ } else {
+ r = zero;
+ if (ix >= 0x3fdda618) {
+ y = (float) 2.0 - x;
+ i = 0;
+ }
+ else if (ix >= 0x3F9da620) {
+ y = x - tc;
+ i = 1;
+ }
+ else {
+ y = x - one;
+ i = 2;
+ }
+ }
+ switch (i) {
+ case 0:
+ z = y * y;
+ p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
+ p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
+ p = mad(y, p1, p2);
+ r += (p - (float) 0.5 * y);
+ break;
+ case 1:
+ z = y * y;
+ w = z * y;
+ p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
+ p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
+ p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
+ p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
+ r += (tf + p);
+ break;
+ case 2:
+ p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
+ p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
+ r += (-(float) 0.5 * y + p1 / p2);
+ }
+ } else if (ix < 0x41000000) {
+ i = (int) x;
+ t = zero;
+ y = x - (float) i;
+
+ p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
+ q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
+ r = .5f * y + p / q;
+ z = one;
+
+ switch (i) {
+ case 7:
+ z *= (y + 6.0f);
+ case 6:
+ z *= (y + 5.0f);
+ case 5:
+ z *= (y + 4.0f);
+ case 4:
+ z *= (y + 3.0f);
+ case 3:
+ z *= (y + 2.0f);
+ r += native_log(z);
+ break;
+ }
+
+ } else if (ix < 0x5c800000) {
+ t = native_log(x);
+ z = one / x;
+ y = z * z;
+ w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
+ r = (x - .5f) * (t - one) + w;
+ } else
+ r = x * (native_log(x) - one);
+ if (hx < 0)
+ r = nadj - r;
+ return r;
+}
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#define BODY \
+ const float \
+ zero= 0., \
+ one = 1.0000000000e+00, \
+ pi = 3.1415927410e+00, \
+ a0 = 7.7215664089e-02, \
+ a1 = 3.2246702909e-01, \
+ a2 = 6.7352302372e-02, \
+ a3 = 2.0580807701e-02, \
+ a4 = 7.3855509982e-03, \
+ a5 = 2.8905137442e-03, \
+ a6 = 1.1927076848e-03, \
+ a7 = 5.1006977446e-04, \
+ a8 = 2.2086278477e-04, \
+ a9 = 1.0801156895e-04, \
+ a10 = 2.5214456400e-05, \
+ a11 = 4.4864096708e-05, \
+ tc = 1.4616321325e+00, \
+ tf = -1.2148628384e-01, \
+ tt = 6.6971006518e-09, \
+ t0 = 4.8383611441e-01, \
+ t1 = -1.4758771658e-01, \
+ t2 = 6.4624942839e-02, \
+ t3 = -3.2788541168e-02, \
+ t4 = 1.7970675603e-02, \
+ t5 = -1.0314224288e-02, \
+ t6 = 6.1005386524e-03, \
+ t7 = -3.6845202558e-03, \
+ t8 = 2.2596477065e-03, \
+ t9 = -1.4034647029e-03, \
+ t10 = 8.8108185446e-04, \
+ t11 = -5.3859531181e-04, \
+ t12 = 3.1563205994e-04, \
+ t13 = -3.1275415677e-04, \
+ t14 = 3.3552918467e-04, \
+ u0 = -7.7215664089e-02, \
+ u1 = 6.3282704353e-01, \
+ u2 = 1.4549225569e+00, \
+ u3 = 9.7771751881e-01, \
+ u4 = 2.2896373272e-01, \
+ u5 = 1.3381091878e-02, \
+ v1 = 2.4559779167e+00, \
+ v2 = 2.1284897327e+00, \
+ v3 = 7.6928514242e-01, \
+ v4 = 1.0422264785e-01, \
+ v5 = 3.2170924824e-03, \
+ s0 = -7.7215664089e-02, \
+ s1 = 2.1498242021e-01, \
+ s2 = 3.2577878237e-01, \
+ s3 = 1.4635047317e-01, \
+ s4 = 2.6642270386e-02, \
+ s5 = 1.8402845599e-03, \
+ s6 = 3.1947532989e-05, \
+ r1 = 1.3920053244e+00, \
+ r2 = 7.2193557024e-01, \
+ r3 = 1.7193385959e-01, \
+ r4 = 1.8645919859e-02, \
+ r5 = 7.7794247773e-04, \
+ r6 = 7.3266842264e-06, \
+ w0 = 4.1893854737e-01, \
+ w1 = 8.3333335817e-02, \
+ w2 = -2.7777778450e-03, \
+ w3 = 7.9365057172e-04, \
+ w4 = -5.9518753551e-04, \
+ w5 = 8.3633989561e-04, \
+ w6 = -1.6309292987e-03; \
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w; \
+ int i, hx, ix; \
+ nadj = 0; \
+ hx = *(int *)&x; \
+ *signgamp = 1; \
+ ix = hx & 0x7fffffff; \
+ if (ix >= 0x7f800000) \
+ return x * x; \
+ if (ix == 0) \
+ return ((x + one) / zero); \
+ if (ix < 0x1c800000) { \
+ if (hx < 0) { \
+ *signgamp = -1; \
+ return -native_log(-x); \
+ } else \
+ return -native_log(x); \
+ } \
+ if (hx < 0) { \
+ if (ix >= 0x4b000000) \
+ return ((-x) / zero); \
+ t = __gen_ocl_internal_sinpi(x); \
+ if (t == zero) \
+ return ((-x) / zero); \
+ nadj = native_log(pi / __gen_ocl_fabs(t * x)); \
+ if (t < zero) \
+ *signgamp = -1; \
+ x = -x; \
+ } \
+ if (ix == 0x3f800000 || ix == 0x40000000) \
+ r = 0; \
+ else if (ix < 0x40000000) { \
+ if (ix <= 0x3f666666) { \
+ r = -native_log(x); \
+ if (ix >= 0x3f3b4a20) { \
+ y = one - x; \
+ i = 0; \
+ } else if (ix >= 0x3e6d3308) { \
+ y = x - (tc - one); \
+ i = 1; \
+ } else { \
+ y = x; \
+ i = 2; \
+ } \
+ } else { \
+ r = zero; \
+ if (ix >= 0x3fdda618) { \
+ y = (float) 2.0 - x; \
+ i = 0; \
+ } \
+ else if (ix >= 0x3F9da620) { \
+ y = x - tc; \
+ i = 1; \
+ } \
+ else { \
+ y = x - one; \
+ i = 2; \
+ } \
+ } \
+ switch (i) { \
+ case 0: \
+ z = y * y; \
+ p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0); \
+ p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1); \
+ p = mad(y, p1, p2); \
+ r = r - mad(y, 0.5f, -p); \
+ break; \
+ case 1: \
+ z = y * y; \
+ w = z * y; \
+ p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0); \
+ p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1); \
+ p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2); \
+ p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
+ r += (tf + p); \
+ break; \
+ case 2: \
+ p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0); \
+ p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one); \
+ r = r + mad(y, -0.5f, p1 / p2); \
+ } \
+ } else if (ix < 0x41000000) { \
+ i = (int) x; \
+ t = zero; \
+ y = x - (float) i; \
+ p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0); \
+ q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one); \
+ r = mad(y, 0.5f, p / q); \
+ z = one; \
+ switch (i) { \
+ case 7: \
+ z *= (y + (float) 6.0); \
+ case 6: \
+ z *= (y + (float) 5.0); \
+ case 5: \
+ z *= (y + (float) 4.0); \
+ case 4: \
+ z *= (y + (float) 3.0); \
+ case 3: \
+ z *= (y + (float) 2.0); \
+ r += native_log(z); \
+ break; \
+ } \
+ \
+ } else if (ix < 0x5c800000) { \
+ t = native_log(x); \
+ z = one / x; \
+ y = z * z; \
+ w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0); \
+ r = (x - .5f) * (t - one) + w; \
+ } else \
+ r = x * (native_log(x) - one); \
+ if (hx < 0) \
+ r = nadj - r; \
+ return r;
+OVERLOADABLE float lgamma_r(float x, int *signgamp) { BODY; }
+#undef BODY
+
+OVERLOADABLE float log1p(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_log1p(x);
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ const float
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
+ Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+ Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+ Lp3 = 2.8571429849e-01, /* 3E924925 */
+ Lp4 = 2.2222198546e-01; /* 3E638E29 */
+ const float zero = 0.0;
+ float hfsq,f,c,s,z,R,u;
+ int k,hx,hu,ax;
+ union {float f; unsigned i;} un;
+ un.f = x; hx = un.i;
+ ax = hx&0x7fffffff;
+
+ k = 1;
+ if (hx < 0x3ed413d7) { /* x < 0.41422 */
+ if(ax>=0x3f800000) { /* x <= -1.0 */
+ if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=+inf */
+ else return (x-x)/(x-x); /* log1p(x<-1)=NaN */
+ }
+ if(ax<0x31000000) { /* |x| < 2**-29 */
+ if(two25+x>zero /* raise inexact */
+ &&ax<0x24800000) /* |x| < 2**-54 */
+ return x;
+ else
+ return x - x*x*(float)0.5;
+ }
+ if(hx>0||hx<=((int)0xbe95f61f)) { /* -0.2929<x<0.41422 */
+ k=0; f=x; hu=1;
+ }
+ }
+ if (hx >= 0x7f800000) return x+x;
+ if(k!=0) {
+ if(hx<0x5a000000) {
+ u = (float)1.0+x;
+
+ un.f = u; hu = un.i;
+ k = (hu>>23)-127;
+ /* correction term */
+ c = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+ c /= u;
+ } else {
+ u = x;
+ un.f = u; hu = un.i;
+ k = (hu>>23)-127;
+ c = 0;
+ }
+ hu &= 0x007fffff;
+ if(hu<0x3504f7) {
+ un.i = hu|0x3f800000; u = un.f;/* normalize u */
+ } else {
+ k += 1;
+ un.i = hu|0x3f000000; u = un.f; /* normalize u/2 */
+ hu = (0x00800000-hu)>>2;
+ }
+ f = u-(float)1.0;
+ }
+ hfsq=(float)0.5*f*f;
+ if(hu==0)
+ { /* |f| < 2**-20 */
+ if(f==zero)
+ {
+ if(k==0) return zero;
+ else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+ }
+ R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
+ if(k==0) return f-R; else
+ return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
+ }
+ s = f/((float)2.0+f);
+ z = s*s;
+ R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+ if(k==0)
+ return f + mad(hfsq + R, s, -hfsq);
+ else
+ return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
+}
+
+OVERLOADABLE float logb(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_logb(x);
+
+ union {float f; unsigned i;} u;
+ u.f = x;
+ int e = ((u.i & 0x7f800000) >> 23);
+ float r1 = e-127;
+ float r2 = -INFINITY;
+ float r3 = x*x;
+ /* sub normal or +/-0 */
+ float r = e == 0 ? r2 : r1;
+ /* inf & nan */
+ return e == 0xff ? r3 : r;
+}
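+/* editor's sketch (illustrative): logb(8.0f) reads bits 0x41000000, so
+ e = 130 and the result is 130 - 127 = 3.0f; subnormals and +/-0 take
+ the -INFINITY path, inf and NaN return x*x. */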
+
+OVERLOADABLE int ilogb(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_ilogb(x);
+
+ union { int i; float f; } u;
+ if (isnan(x))
+ return FP_ILOGBNAN;
+ if (isinf(x))
+ return 0x7FFFFFFF;
+ u.f = x;
+ u.i &= 0x7fffffff;
+ if (u.i == 0)
+ return FP_ILOGB0;
+ if (u.i >= 0x800000)
+ return (u.i >> 23) - 127;
+ int r = -126;
+ int a = u.i & 0x7FFFFF;
+ while(a < 0x800000) {
+ a <<= 1;
+ r --;
+ }
+ return r;
+}
+OVERLOADABLE float nan(uint code) {
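+ /* editor's note: the nancode payload is ignored here; every code maps
+ to the same default quiet NaN. */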
+ return NAN;
+}
+OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+ float sign = 1.0f;
+ int ix;
+ if(isinf(x)) return NAN;
+ if(x < 0.0f) { x = -x; sign = -1.0f; }
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ if(x> 0x1.0p24) return 0.0f;
+ float m = __gen_ocl_internal_floor(x);
+ ix = (int)m;
+ m = x-m;
+ int n = __gen_ocl_internal_floor(m*4.0f);
+ if(m == 0.5f) {
+ return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+ }
+ if(m == 0.0f) {
+ return (ix&0x1) == 0 ? 0.0f : -0.0f;
+ }
+
+ switch(n) {
+ case 0:
+ return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+ case 1:
+ return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+ case 2:
+ return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+ default:
+ return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
+ }
+}
+OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+ /* copied from fdlibm */
+ const unsigned
+ B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
+ B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
+
+ const float
+ C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */
+ D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+ E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */
+ F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */
+ G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */
+
+ float r,s,t, w;
+ int hx;
+ uint sign;
+ uint high;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ sign=hx&0x80000000; /* sign= sign(x) */
+ hx ^=sign;
+ if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+ if(hx==0)
+ return(x); /* cbrt(0) is itself */
+
+ GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+ /* rough cbrt to 5 bits */
+ if(hx<0x00800000) /* subnormal number */
+ {
+ //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
+ //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
+ /* Gen flushes subnormal inputs to zero; return a correctly signed zero */
+ t = (sign == 0) ? 0.0f : -0.0f;
+ return t;
+ }
+ else
+ GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+
+
+ /* new cbrt to 23 bits */
+ r=t*t/x;
+ s=mad(r, t, C);
+ t*=G+F/(s+E+D/s);
+ /* one step newton iteration to 53 bits with error less than 0.667 ulps */
+ s=t*t; /* t*t is exact */
+ r=x/s;
+ w=t+t;
+ r=(r-t)/(w+r); /* r-s is exact */
+ t=mad(t, r, t);
+
+ /* restore the sign bit */
+ GEN_OCL_GET_FLOAT_WORD(high,t);
+ GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+ return(t);
+}
+
+#define BODY \
+ *cosval = cos(x); \
+ return sin(x);
+
+OVERLOADABLE float sincos(float x, float *cosval) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_sincos(x, cosval);
+ BODY;
+}
+#undef BODY
+
+INLINE float __gen_ocl_asin_util(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float
+ pS0 = 1.66666666666666657415e-01,
+ pS1 = -3.25565818622400915405e-01,
+ pS2 = 2.01212532134862925881e-01,
+ pS3 = -4.00555345006794114027e-02,
+ pS4 = 7.91534994289814532176e-04,
+ qS1 = -2.40339491173441421878e+00,
+ qS2 = 2.02094576023350569471e+00,
+ qS3 = -6.88283971605453293030e-01,
+ qS4 = 7.70381505559019352791e-02;
+
+ float t = x*x;
+ float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
+ float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
+ float w = p / q;
+ return mad(x, w, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+ uint ix;
+ union { uint i; float f; } u;
+ u.f = x;
+ ix = u.i & 0x7fffffff;
+ if(ix == 0x3f800000) {
+ return x * M_PI_2_F; /* asin(|1|)=+-pi/2 with inexact */
+ }
+ if(ix > 0x3f800000) { /* |x|>= 1 */
+ return NAN; /* asin(|x|>1) is NaN */
+ }
+
+ if(ix < 0x32000000) { /* if |x| < 2**-27 */
+ if(HUGE_VALF + x > FLT_ONE) return x; /* return x with inexact if x!=0*/
+ }
+
+ if(x < -0.5) {
+ return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+ } else if(x > 0.5) {
+ return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+ } else {
+ return __gen_ocl_asin_util(x);
+ }
+}
+OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+ return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+ if(x > 0.5)
+ return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
+ else
+ return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+ return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+__constant float atanhi[4] = {
+ 4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+ 7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+ 9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+ 1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+};
+__constant float atanlo[4] = {
+ 5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+ 3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+ 3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+ 7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+};
+
+OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+ /* copied from fdlibm */
+ float aT[11];
+ aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+ aT[1] = -2.0000000298e-01; /* 0xbe4ccccd */
+ aT[2] = 1.4285714924e-01; /* 0x3e124925 */
+ aT[3] = -1.1111110449e-01; /* 0xbde38e38 */
+ aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
+ aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
+ aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
+ const float one = 1.0, huge = 1.0e30;
+
+ float w,s1,s2,z;
+ int ix,hx,id;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix>=0x50800000) { /* if |x| >= 2^34 */
+ if(ix>0x7f800000)
+ return x+x; /* NaN */
+ if(hx>0) return atanhi[3]+atanlo[3];
+ else return -atanhi[3]-atanlo[3];
+ } if (ix < 0x3ee00000) { /* |x| < 0.4375 */
+ if (ix < 0x31000000) { /* |x| < 2^-29 */
+ if(huge+x>one) return x; /* raise inexact */
+ }
+ id = -1;
+ } else {
+ x = __gen_ocl_fabs(x);
+ if (ix < 0x3f980000) { /* |x| < 1.1875 */
+ if (ix < 0x3f300000) { /* 7/16 <=|x|<11/16 */
+ id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+ } else { /* 11/16<=|x|< 19/16 */
+ id = 1; x = (x-one)/(x+one);
+ }
+ } else {
+ if (ix < 0x401c0000) { /* |x| < 2.4375 */
+ id = 2; x = (x-(float)1.5)/(one+(float)1.5*x);
+ } else { /* 2.4375 <= |x| < 2^66 */
+ id = 3; x = -(float)1.0/x;
+ }
+ }}
+ /* end of argument reduction */
+ z = x*x;
+ w = z*z;
+ /* break sum from i=0 to 6 of aT[i]*z**(i+1) into odd and even poly */
+ s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
+ s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
+ if (id<0) return x - x*(s1+s2);
+ else {
+ z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+ return (hx<0)? -z:z;
+ }
+
+}
+OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+ return __gen_ocl_internal_atan(x) / M_PI_F;
+}
+
+// XXX work-around PTX profile
+OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
+OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+ /* copied from fdlibm */
+ float z;
+ int k,m,hx,hy,ix,iy;
+ const float
+ tiny = 1.0e-30,
+ zero = 0.0,
+ pi_o_4 = 7.8539818525e-01, /* 0x3f490fdb */
+ pi_o_2 = 1.5707963705e+00, /* 0x3fc90fdb */
+ pi = 3.1415927410e+00, /* 0x40490fdb */
+ pi_lo = -8.7422776573e-08; /* 0xb3bbbd2e */
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ iy = hy&0x7fffffff;
+
+ if((ix>0x7f800000)||
+ (iy>0x7f800000)) /* x or y is NaN */
+ return x+y;
+ if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y); /* x=1.0 */
+ m = ((hy>>31)&1)|((hx>>30)&2); /* 2*sign(x)+sign(y) */
+
+ /* when y = 0 */
+ if(iy==0) {
+ switch(m) {
+ case 0:
+ case 1: return y; /* atan(+-0,+anything)=+-0 */
+ case 2: return pi+tiny;/* atan(+0,-anything) = pi */
+ case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
+ }
+ }
+ /* when x = 0 */
+ if(ix==0) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+ /* both x and y are denormal. Gen does not support denormals, so rescale
+ them to normal floats; atan2 depends only on the signs and the ratio
+ y/x, which the common rescaling preserves. */
+ if(ix <= 0x7fffff && iy <= 0x7fffff) {
+ x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
+ y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
+ }
+
+ /* when x is INF */
+ if(ix==0x7f800000) {
+ if(iy==0x7f800000) {
+ switch(m) {
+ case 0: return pi_o_4+tiny;/* atan(+INF,+INF) */
+ case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
+ case 2: return (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
+ case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
+ }
+ } else {
+ switch(m) {
+ case 0: return zero ; /* atan(+...,+INF) */
+ case 1: return -zero ; /* atan(-...,+INF) */
+ case 2: return pi+tiny ; /* atan(+...,-INF) */
+ case 3: return -pi-tiny ; /* atan(-...,-INF) */
+ }
+ }
+ }
+ /* when y is INF */
+ if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+ /* compute y/x */
+ k = (iy-ix)>>23;
+ if(k > 60) z=pi_o_2+(float)0.5*pi_lo; /* |y/x| > 2**60 */
+ else if(hx<0&&k<-60) z=0.0; /* x < 0 and |y/x| < 2**-60 */
+ else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
+ switch (m) {
+ case 0: return z ; /* atan(+,+) */
+ case 1: {
+ uint zh;
+ GEN_OCL_GET_FLOAT_WORD(zh,z);
+ GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
+ }
+ return z ; /* atan(-,+) */
+ case 2: return pi-(z-pi_lo);/* atan(+,-) */
+ default: /* case 3 */
+ return (z-pi_lo)-pi;/* atan(-,-) */
+ }
+}
+
+OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+ return __gen_ocl_internal_atan2(y, x) / M_PI_F;
+}
+OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
+OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+OVERLOADABLE float __gen_ocl_internal_round(float x) {
+ float y = __gen_ocl_rndz(x);
+ if (__gen_ocl_fabs(x - y) >= 0.5f)
+ y += __gen_ocl_internal_copysign(1.f, x);
+ return y;
+}
+OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
+OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+ return __gen_ocl_rnde(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+ float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
+ u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
+ twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ one = 1.0,
+ huge = 1.0e+30,
+ P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+ P2 = -2.7777778450e-03; /* 0xbb360b61 */
+ float y,hi=0.0,lo=0.0,c,t;
+ int k=0,xsb;
+ unsigned hx;
+ float ln2HI_0 = 6.9313812256e-01; /* 0x3f317180 */
+ float ln2HI_1 = -6.9313812256e-01; /* 0xbf317180 */
+ float ln2LO_0 = 9.0580006145e-06; /* 0x3717f7d1 */
+ float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+ float half_0 = 0.5;
+ float half_1 = -0.5;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ xsb = (hx>>31)&1; /* sign bit of x */
+ hx &= 0x7fffffff; /* high word of |x| */
+
+ /* filter out non-finite argument */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ if(hx>0x7f800000)
+ return x+x; /* NaN */
+ if(hx==0x7f800000)
+ return (xsb==0)? x:0.0; /* exp(+-inf)={inf,0} */
+ if(x > o_threshold) return huge*huge; /* overflow */
+ if(x < u_threshold) return twom100*twom100; /* underflow */
+ }
+ /* argument reduction */
+ if(hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) { /* and |x| < 1.5 ln2 */
+ hi = x - (xsb == 1 ? ln2HI_1 : ln2HI_0);
+ lo = (xsb == 1) ? ln2LO_1 : ln2LO_0;
+ k = 1 - xsb - xsb;
+ } else {
+ float tmp = xsb == 1 ? half_1 : half_0;
+ k = ivln2*x+tmp;
+ t = k;
+ hi = x - t*ln2HI_0; /* t*ln2HI is exact here */
+ lo = t*ln2LO_0;
+ }
+ x = hi - lo;
+ }
+ else if(hx < 0x31800000) { /* when |x|<2**-28 */
+ if(huge+x>one) return one+x;/* trigger inexact */
+ }
+ else k = 0;
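+ /* editor's note: after the reduction above, the new x satisfies
+ original_x ~= k*ln2 + x with |x| <= 0.5*ln2, so
+ exp(original_x) = 2**k * exp(x); k is folded back into the exponent
+ bits after the polynomial evaluation below. */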
+
+ /* x is now in primary range */
+ t = x*x;
+ c = x - t*(P1+t*P2);
+ if(k==0)
+ return one-((x*c)/(c-(float)2.0)-x);
+ else
+ y = one-((lo-(x*c)/((float)2.0-c))-hi);
+ if(k >= -125) {
+ unsigned hy;
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */
+ return y;
+ } else {
+ unsigned hy;
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+ return y*twom100;
+ }
+}
+
+/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+/*...*/
+const float
+tiny = 1.0e-30,
+half_val= 5.0000000000e-01, /* 0x3F000000 */
+one = 1.0000000000e+00, /* 0x3F800000 */
+two = 2.0000000000e+00, /* 0x40000000 */
+ /* c = (subfloat)0.84506291151 */
+erx = 8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to erf on [0,0.84375]
+ */
+efx = 1.2837916613e-01, /* 0x3e0375d4 */
+efx8= 1.0270333290e+00, /* 0x3f8375d4 */
+pp0 = 1.2837916613e-01, /* 0x3e0375d4 */
+pp1 = -3.2504209876e-01, /* 0xbea66beb */
+pp2 = -2.8481749818e-02, /* 0xbce9528f */
+pp3 = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4 = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1 = 3.9791721106e-01, /* 0x3ecbbbce */
+qq2 = 6.5022252500e-02, /* 0x3d852a63 */
+qq3 = 5.0813062117e-03, /* 0x3ba68116 */
+qq4 = 1.3249473704e-04, /* 0x390aee49 */
+qq5 = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to erf in [0.84375,1.25]
+ */
+pa0 = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1 = 4.1485610604e-01, /* 0x3ed46805 */
+pa2 = -3.7220788002e-01, /* 0xbebe9208 */
+pa3 = 3.1834661961e-01, /* 0x3ea2fe54 */
+pa4 = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5 = 3.5478305072e-02, /* 0x3d1151b3 */
+pa6 = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1 = 1.0642088205e-01, /* 0x3dd9f331 */
+qa2 = 5.4039794207e-01, /* 0x3f0a5785 */
+qa3 = 7.1828655899e-02, /* 0x3d931ae7 */
+qa4 = 1.2617121637e-01, /* 0x3e013307 */
+qa5 = 1.3637083583e-02, /* 0x3c5f6e13 */
+qa6 = 1.1984500103e-02, /* 0x3c445aa3 */
+ /*
+ * Coefficients for approximation to erfc in [1.25,1/0.35]
+ */
+ra0 = -9.8649440333e-03, /* 0xbc21a093 */
+ra1 = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2 = -1.0558626175e+01, /* 0xc128f022 */
+ra3 = -6.2375331879e+01, /* 0xc2798057 */
+ra4 = -1.6239666748e+02, /* 0xc322658c */
+ra5 = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6 = -8.1287437439e+01, /* 0xc2a2932b */
+ra7 = -9.8143291473e+00, /* 0xc11d077e */
+sa1 = 1.9651271820e+01, /* 0x419d35ce */
+sa2 = 1.3765776062e+02, /* 0x4309a863 */
+sa3 = 4.3456588745e+02, /* 0x43d9486f */
+sa4 = 6.4538726807e+02, /* 0x442158c9 */
+sa5 = 4.2900814819e+02, /* 0x43d6810b */
+sa6 = 1.0863500214e+02, /* 0x42d9451f */
+sa7 = 6.5702495575e+00, /* 0x40d23f7c */
+sa8 = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to erfc in [1/.35,28]
+ */
+rb0 = -9.8649431020e-03, /* 0xbc21a092 */
+rb1 = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2 = -1.7757955551e+01, /* 0xc18e104b */
+rb3 = -1.6063638306e+02, /* 0xc320a2ea */
+rb4 = -6.3756646729e+02, /* 0xc41f6441 */
+rb5 = -1.0250950928e+03, /* 0xc480230b */
+rb6 = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1 = 3.0338060379e+01, /* 0x41f2b459 */
+sb2 = 3.2579251099e+02, /* 0x43a2e571 */
+sb3 = 1.5367296143e+03, /* 0x44c01759 */
+sb4 = 3.1998581543e+03, /* 0x4547fdbb */
+sb5 = 2.5530502930e+03, /* 0x451f90ce */
+sb6 = 4.7452853394e+02, /* 0x43ed43a7 */
+sb7 = -2.2440952301e+01; /* 0xc1b38712 */
+
+ int hx,ix,i;
+ float R,S,P,Q,s,y,z,r;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix>=0x7f800000) { /* erf(nan)=nan */
+ i = ((unsigned int)hx>>31)<<1;
+ return (float)(1-i)+one/x; /* erf(+-inf)=+-1 */
+ }
+
+ if(ix < 0x3f580000) { /* |x|<0.84375 */
+ if(ix < 0x31800000) { /* |x|<2**-28 */
+ if (ix < 0x04000000)
+ /*avoid underflow */
+ return (float)0.125*((float)8.0*x+efx8*x);
+ return x + efx*x;
+ }
+ z = x*x;
+ r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+ s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
+ y = r / s;
+ return mad(x, y, x);
+ }
+ if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
+ s = __gen_ocl_internal_fabs(x)-one;
+ P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
+ if(hx>=0) return erx + P/Q; else return -erx - P/Q;
+ }
+ if (ix >= 0x40c00000) { /* inf>|x|>=6 */
+ if(hx>=0) return one-tiny; else return tiny-one;
+ }
+ x = __gen_ocl_internal_fabs(x);
+ s = one/(x*x);
+ if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
+ } else { /* |x| >= 1/0.35 */
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
+ }
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
+ r = __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+ if(hx>=0) return one-r/x; else return r/x-one;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+/*...*/
+const float
+tiny = 1.0e-30,
+half_val= 5.0000000000e-01, /* 0x3F000000 */
+one = 1.0000000000e+00, /* 0x3F800000 */
+two = 2.0000000000e+00, /* 0x40000000 */
+ /* c = (subfloat)0.84506291151 */
+erx = 8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to erf on [0,0.84375]
+ */
+efx = 1.2837916613e-01, /* 0x3e0375d4 */
+efx8= 1.0270333290e+00, /* 0x3f8375d4 */
+pp0 = 1.2837916613e-01, /* 0x3e0375d4 */
+pp1 = -3.2504209876e-01, /* 0xbea66beb */
+pp2 = -2.8481749818e-02, /* 0xbce9528f */
+pp3 = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4 = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1 = 3.9791721106e-01, /* 0x3ecbbbce */
+qq2 = 6.5022252500e-02, /* 0x3d852a63 */
+qq3 = 5.0813062117e-03, /* 0x3ba68116 */
+qq4 = 1.3249473704e-04, /* 0x390aee49 */
+qq5 = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to erf in [0.84375,1.25]
+ */
+pa0 = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1 = 4.1485610604e-01, /* 0x3ed46805 */
+pa2 = -3.7220788002e-01, /* 0xbebe9208 */
+pa3 = 3.1834661961e-01, /* 0x3ea2fe54 */
+pa4 = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5 = 3.5478305072e-02, /* 0x3d1151b3 */
+pa6 = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1 = 1.0642088205e-01, /* 0x3dd9f331 */
+qa2 = 5.4039794207e-01, /* 0x3f0a5785 */
+qa3 = 7.1828655899e-02, /* 0x3d931ae7 */
+qa4 = 1.2617121637e-01, /* 0x3e013307 */
+qa5 = 1.3637083583e-02, /* 0x3c5f6e13 */
+qa6 = 1.1984500103e-02, /* 0x3c445aa3 */
+ /*
+ * Coefficients for approximation to erfc in [1.25,1/0.35]
+ */
+ra0 = -9.8649440333e-03, /* 0xbc21a093 */
+ra1 = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2 = -1.0558626175e+01, /* 0xc128f022 */
+ra3 = -6.2375331879e+01, /* 0xc2798057 */
+ra4 = -1.6239666748e+02, /* 0xc322658c */
+ra5 = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6 = -8.1287437439e+01, /* 0xc2a2932b */
+ra7 = -9.8143291473e+00, /* 0xc11d077e */
+sa1 = 1.9651271820e+01, /* 0x419d35ce */
+sa2 = 1.3765776062e+02, /* 0x4309a863 */
+sa3 = 4.3456588745e+02, /* 0x43d9486f */
+sa4 = 6.4538726807e+02, /* 0x442158c9 */
+sa5 = 4.2900814819e+02, /* 0x43d6810b */
+sa6 = 1.0863500214e+02, /* 0x42d9451f */
+sa7 = 6.5702495575e+00, /* 0x40d23f7c */
+sa8 = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to erfc in [1/.35,28]
+ */
+rb0 = -9.8649431020e-03, /* 0xbc21a092 */
+rb1 = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2 = -1.7757955551e+01, /* 0xc18e104b */
+rb3 = -1.6063638306e+02, /* 0xc320a2ea */
+rb4 = -6.3756646729e+02, /* 0xc41f6441 */
+rb5 = -1.0250950928e+03, /* 0xc480230b */
+rb6 = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1 = 3.0338060379e+01, /* 0x41f2b459 */
+sb2 = 3.2579251099e+02, /* 0x43a2e571 */
+sb3 = 1.5367296143e+03, /* 0x44c01759 */
+sb4 = 3.1998581543e+03, /* 0x4547fdbb */
+sb5 = 2.5530502930e+03, /* 0x451f90ce */
+sb6 = 4.7452853394e+02, /* 0x43ed43a7 */
+sb7 = -2.2440952301e+01; /* 0xc1b38712 */
+ int hx,ix;
+ float R,S,P,Q,s,y,z,r;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix>=0x7f800000) { /* erfc(nan)=nan */
+ /* erfc(+-inf)=0,2 */
+ return (float)(((unsigned int)hx>>31)<<1)+one/x;
+ }
+
+ if(ix < 0x3f580000) { /* |x|<0.84375 */
+ if(ix < 0x23800000) /* |x|<2**-56 */
+ return one-x;
+ z = x*x;
+ r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+ s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
+ y = r/s;
+ if(hx < 0x3e800000) { /* x<1/4 */
+ return one-(x+x*y);
+ } else {
+ r = x*y;
+ r += (x-half_val);
+ return half_val - r ;
+ }
+ }
+ if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
+ s = __gen_ocl_internal_fabs(x)-one;
+ P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
+ if(hx>=0) {
+ z = one-erx; return z - P/Q;
+ } else {
+ z = erx+P/Q; return one+z;
+ }
+ }
+ if (ix < 0x41e00000) { /* |x|<28 */
+ x = __gen_ocl_internal_fabs(x);
+ s = one/(x*x);
+ if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
+ } else { /* |x| >= 1/.35 ~ 2.857143 */
+ if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
+ }
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
+ r = __gen_ocl_internal_exp(-z*z-(float)0.5625)*
+ __gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+ if(hx>0) {
+ float ret = r/x;
+ return ret;
+ } else
+ return two-r/x;
+ } else {
+ if(hx>0) {
+ return tiny*tiny;
+ } else
+ return two-tiny;
+ }
+}
+
+OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+ //return x-y*__gen_ocl_rndz(x/y);
+ float one = 1.0;
+ float Zero[2];
+ int n,hx,hy,hz,ix,iy,sx,i;
+ Zero[0] = 0.0;
+ Zero[1] = -0.0;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ sx = hx&0x80000000; /* sign of x */
+ hx ^=sx; /* |x| */
+ hy &= 0x7fffffff; /* |y| */
+ /* purge off exception values */
+ if(hy==0||(hx>=0x7f800000)|| /* y=0,or x not finite */
+ (hy>0x7f800000)) /* or y is NaN */
+ return (x*y)/(x*y);
+ if(hx<hy) return x; /* |x|<|y| return x */
+ if(hx==hy)
+ return Zero[(unsigned)sx>>31]; /* |x|=|y| return x*0*/
+
+ /* determine ix = ilogb(x) */
+ if(hx<0x00800000) { /* subnormal x */
+ for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+ } else ix = (hx>>23)-127;
+
+ /* determine iy = ilogb(y) */
+ if(hy<0x00800000) { /* subnormal y */
+ for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+ } else iy = (hy>>23)-127;
+
+ /* set up {hx,lx}, {hy,ly} and align y to x */
+ if(ix >= -126)
+ hx = 0x00800000|(0x007fffff&hx);
+ else { /* subnormal x, shift x to normal */
+ n = -126-ix;
+ hx = hx<<n;
+ }
+ if(iy >= -126)
+ hy = 0x00800000|(0x007fffff&hy);
+ else { /* subnormal y, shift y to normal */
+ n = -126-iy;
+ hy = hy<<n;
+ }
+ /* fix point fmod */
+ n = ix - iy;
+ while(n--) {
+ hz=hx-hy;
+ if(hz<0){hx = hx+hx;}
+ else {
+ if(hz==0) /* return sign(x)*0 */
+ return Zero[(unsigned)sx>>31];
+ hx = hz+hz;
+ }
+ }
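+ /* editor's note: the loop above is binary long division on the aligned
+ 24-bit mantissas: each of the n = ilogb(x)-ilogb(y) steps subtracts hy
+ when it fits, then shifts the remainder left one bit. */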
+ hz=hx-hy;
+ if(hz>=0) {hx=hz;}
+
+ /* convert back to floating value and restore the sign */
+ if(hx==0) /* return sign(x)*0 */
+ return Zero[(unsigned)sx>>31];
+ while(hx<0x00800000) { /* normalize x */
+ hx = hx+hx;
+ iy -= 1;
+ }
+ if(iy>= -126) { /* normalize output */
+ hx = ((hx-0x00800000)|((iy+127)<<23));
+ GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+ } else { /* subnormal output */
+ n = -126 - iy;
+ hx >>= n;
+ GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+ x *= one; /* create necessary signal */
+ }
+ return x; /* exact output */
+}
+
+OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+ //return __gen_ocl_pow(M_E_F, x) - 1;
+ float Q1 = -3.3333335072e-02, /* 0xbd088889 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ one = 1.0,
+ o_threshold= 8.8721679688e+01; /* 0x42b17180 */
+ float y,hi,lo,c,t,e,hxs,hfx,r1;
+ int k,xsb;
+ int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ xsb = hx&0x80000000;
+ /* sign bit of x */
+ //if(xsb==0)
+ //y=x;
+ //else
+ //y= -x; /* y = |x| */
+ y = __gen_ocl_internal_fabs(x);
+ hx &= 0x7fffffff; /* high word of |x| */
+ /* filter out huge and non-finite argument */
+ if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ if(hx>0x7f800000)
+ return x+x; /* NaN */
+ if(hx==0x7f800000)
+ return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+ if(x > o_threshold)
+ return huge*huge; /* overflow */
+ }
+ if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+ if(x+tiny<(float)0.0) /* raise inexact */
+ return tiny-one; /* return -1 */
+ }
+ }
+ /* argument reduction */
+ if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+ if(xsb==0){
+ hi = x - ln2_hi; lo = ln2_lo; k = 1;
+ } else {
+ hi = x + ln2_hi; lo = -ln2_lo; k = -1;
+ }
+ } else {
+ k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+ t = k;
+ hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+ lo = t*ln2_lo;
+ }
+ x = hi - lo;
+ c = (hi-x)-lo;
+ } else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
+ //t = huge+x; /* return x with inexact flags when x!=0 */
+ //return x - (t-(huge+x));
+ return x;
+ } else k = 0;
+ /* x is now in primary range */
+ hfx = (float)0.5*x;
+ hxs = x*hfx;
+ r1 = one+hxs*(Q1+hxs*Q2);
+ t = (float)3.0-r1*hfx;
+ e = hxs*((r1-t)/((float)6.0 - x*t));
+ if(k==0)
+ return x - (x*e-hxs); /* c is 0 */
+ else{
+ e = (x*(e-c)-c);
+ e -= hxs;
+ if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+ if(k==1){
+ if(x < (float)-0.25)
+ return -(float)2.0*(e-(x+(float)0.5));
+ else
+ return (one+(float)2.0*(x-e));
+ }
+ if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
+ int i;
+ y = one-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ return y-one;
+ }
+ t = one;
+ if(k<23) {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+ y = t-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ } else {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
+ y = x-(e+t);
+ y += one;
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ }
+ }
+ return y;
+}
+
+OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+ //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+ float one = 1.0,
+ ln2 = 6.9314718246e-01;/* 0x3f317218 */
+ float t;
+ int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ if(hx<0x3f800000) { /* x < 1 */
+ return (x-x)/(x-x);
+ } else if(hx >=0x4d800000) { /* x > 2**28 */
+ if(hx >=0x7f800000) { /* x is inf or NaN */
+ return x+x;
+ } else
+ return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+ } else if (hx==0x3f800000) {
+ return 0.0; /* acosh(1) = 0 */
+ } else if (hx > 0x40000000) { /* 2**28 > x > 2 */
+ t=x*x;
+ return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
+ } else { /* 1<x<2 */
+ t = x-one;
+ return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+ }
+}
+
+OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+ //return native_log(x + native_sqrt(x * x + 1));
+ float one = 1.0000000000e+00, /* 0x3F800000 */
+ ln2 = 6.9314718246e-01, /* 0x3f317218 */
+ huge= 1.0000000000e+30;
+ float w;
+ int hx,ix;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix< 0x38000000) { /* |x|<2**-14 */
+ if(huge+x>one) return x; /* return x inexact except 0 */
+ }
+ if(ix>0x47000000) {/* |x| > 2**14 */
+ if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+ w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+ } else {
+ float xa = __gen_ocl_internal_fabs(x);
+ if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
+ w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
+ } else { /* 2.0 > |x| > 2**-14 */
+ float t = xa*xa;
+ w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+ }
+ }
+ return __gen_ocl_internal_copysign(w, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+ //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+ float one = 1.0,
+ shuge = 1.0e37;
+ float t,w,h;
+ int ix,jx;
+ GEN_OCL_GET_FLOAT_WORD(jx,x);
+ ix = jx&0x7fffffff;
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) return x+x;
+ h = 0.5;
+ if (jx<0) h = -h;
+ /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1)), E = expm1(|x|) */
+ if (ix < 0x41b00000) { /* |x|<22 */
+ if (ix<0x31800000) /* |x|<2**-28 */
+ if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+ t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+ if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+ return h*(t+t/(t+one));
+ }
+ /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+ if (ix < 0x42b17180) return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+ /* |x| in [log(maxdouble), overflowthresold] */
+ if (ix<=0x42b2d4fc) {
+ w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+ t = h*w;
+ return t*w;
+ }
+ /* |x| > overflowthresold, sinh(x) overflow */
+ return x*shuge;
+}
+
+OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+ //float y = native_exp(-2 * x);
+ //return (1 - y) / (1 + y);
+ float one=1.0, two=2.0, tiny = 1.0e-30;
+ float t,z;
+ int jx,ix;
+ GEN_OCL_GET_FLOAT_WORD(jx,x);
+ ix = jx&0x7fffffff;
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) {
+ if (jx>=0)
+ return one/x+one; /* tanh(+-inf)=+-1 */
+ else
+ return one/x-one; /* tanh(NaN) = NaN */
+ }
+
+ if (ix < 0x41b00000) { /* |x|<22 */
+ if (ix == 0)
+ return x; /* x == +-0 */
+ if (ix<0x24000000) /* |x|<2**-55 */
+ return x*(one+x); /* tanh(small) = small */
+ if (ix>=0x3f800000) { /* |x|>=1 */
+ t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+ z = one - two/(t+two);
+ } else {
+ t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+ z= -t/(t+two);
+ }
+ } else { /* |x| > 22, return +-1 */
+ z = one - tiny; /* raised inexact flag */
+ }
+ return (jx>=0)? z: -z;
+}
+
+OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+ //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+ float halF = 0.5,
+ huge = 1.0e+30,
+ tiny = 1.0e-30,
+ one = 1.0;
+ float t,w;
+ int ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ ix &= 0x7fffffff;
+ /* |x| in [0,22] */
+ if (ix < 0x41b00000) {
+ /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+ if(ix<0x3eb17218) {
+ t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+ w = one+t;
+ if (ix<0x24000000) return w; /* cosh(tiny) = 1 */
+ return one+(t*t)/(w+w);
+ }
+ /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|))/2 */
+ t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+ return halF*t+halF/t;
+ }
+ /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
+ if (ix < 0x42b17180) return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+ /* |x| in [log(maxdouble), overflowthresold] */
+ if (ix<=0x42b2d4fc) {
+ w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
+ t = halF*w;
+ return t*w;
+ }
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) return x*x;
+ /* |x| > overflowthresold, cosh(x) overflow */
+ return huge*huge;
+}
+
+OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
+ //return x-y*__gen_ocl_rnde(x/y);
+ float zero = 0.0;
+ int hx,hp;
+ unsigned sx;
+ float p_half;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hp,p);
+ sx = hx&0x80000000;
+ hp &= 0x7fffffff;
+ hx &= 0x7fffffff;
+ /* purge off exception values */
+ if(hp==0) return (x*p)/(x*p); /* p = 0 */
+ if((hx>=0x7f800000)|| /* x not finite */
+ ((hp>0x7f800000))) /* p is NaN */
+ return (x*p)/(x*p);
+ if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+ if ((hx-hp)==0) return zero*x;
+ x = __gen_ocl_fabs(x);
+ p = __gen_ocl_fabs(p);
+ if (hp<0x01000000) {
+ if(x+x>p) {
+ x-=p;
+ if(x+x>=p) x -= p;
+ }
+ } else {
+ p_half = (float)0.5*p;
+ if(x>p_half) {
+ x-=p;
+ if(x>=p_half) x -= p;
+ }
+ }
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
+ return x;
+}
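+/* editor's sketch (illustrative): remainder(5.0f, 2.0f) first reduces
+ via fmod(5, 4) = 1.0f; then p_half = 1.0f and 1.0f > p_half is false,
+ so the result is 1.0f, matching 5 - 2*rint(5/2) with rint(2.5) = 2
+ under round-to-nearest-even. */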
+
+OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+ x = __gen_ocl_scalbnf(x,n);
+ return x;
+}
+
+OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+ //return 0.5f * native_sqrt((1 + x) / (1 - x));
+ float xa = __gen_ocl_fabs (x);
+ float t;
+ if (isless (xa, 0.5f)){
+ if (xa < 0x1.0p-28f) return x;
+ t = xa + xa;
+ t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+ } else if (isless (xa, 1.0f)){
+ t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+ } else{
+ if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
+ return x / 0.0f;
+ }
+ return __gen_ocl_internal_copysign(t, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+ float px, qx,ans;
+ short n;
+ int i;
+ float*p;
+ float MAXL10 = 38.230809449325611792;
+ float LOG210 = 3.32192809488736234787e0;
+ float LG102A = 3.00781250000000000000E-1;
+ float LG102B = 2.48745663981195213739E-4;
+ float P[6];
+ P[0] = 2.063216740311022E-001;
+ P[1] = 5.420251702225484E-001;
+ P[2] = 1.171292686296281E+000;
+ P[3] = 2.034649854009453E+000;
+ P[4] = 2.650948748208892E+000;
+ P[5] = 2.302585167056758E+000;
+
+ if( x < -MAXL10 ) return 0.0;
+
+ if( isinf(x)) return INFINITY;
+ /* The following is necessary because range reduction blows up: */
+ if( x == 0 )return 1.0;
+
+ /* Express 10**x = 10**g 2**n
+ * = 10**g 10**( n log10(2) )
+ * = 10**( g + n log10(2) )
+ */
+ px = x * LOG210;
+ qx = __gen_ocl_internal_floor( px + 0.5 );
+ n = qx;
+ x -= qx * LG102A;
+ x -= qx * LG102B;
+
+ /* rational approximation for exponential
+ * of the fractional part:
+ * 10**x - 1 = 2x P(x**2)/( Q(x**2) - P(x**2) )
+ */
+ p = P;
+ ans = *p++;
+ i = 5;
+ do{
+ ans = ans * x + *p++;
+ }
+ while( --i );
+ px = 1.0 + x * ans;
+
+ /* multiply by power of 2 */
+ x = __gen_ocl_internal_ldexp( px, n );
+ return x;
+}
+
+OVERLOADABLE float cospi(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_cospi(x);
+
+ return __gen_ocl_internal_cospi(x);
+}
+
+OVERLOADABLE float cosh(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_cosh(x);
+
+ return __gen_ocl_internal_cosh(x);
+}
+
+OVERLOADABLE float acos(float x) {
+ return __gen_ocl_internal_acos(x);
+}
+
+OVERLOADABLE float acospi(float x) {
+ return __gen_ocl_internal_acospi(x);
+}
+
+OVERLOADABLE float acosh(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_acosh(x);
+
+ return __gen_ocl_internal_acosh(x);
+}
+
+OVERLOADABLE float sinpi(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_sinpi(x);
+
+ return __gen_ocl_internal_sinpi(x);
+}
+
+OVERLOADABLE float sinh(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_sinh(x);
+
+ return __gen_ocl_internal_sinh(x);
+}
+
+OVERLOADABLE float asin(float x) {
+ return __gen_ocl_internal_asin(x);
+}
+
+OVERLOADABLE float asinpi(float x) {
+ return __gen_ocl_internal_asinpi(x);
+}
+
+OVERLOADABLE float asinh(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_asinh(x);
+
+ return __gen_ocl_internal_asinh(x);
+}
+
+OVERLOADABLE float tanpi(float x) {
+ return __gen_ocl_internal_tanpi(x);
+}
+
+OVERLOADABLE float tanh(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_tanh(x);
+
+ return __gen_ocl_internal_tanh(x);
+}
+
+OVERLOADABLE float atan(float x) {
+ return __gen_ocl_internal_atan(x);
+}
+
+OVERLOADABLE float atan2(float y, float x) {
+ return __gen_ocl_internal_atan2(y, x);
+}
+
+OVERLOADABLE float atan2pi(float y, float x) {
+ return __gen_ocl_internal_atan2pi(y, x);
+}
+
+OVERLOADABLE float atanpi(float x) {
+ return __gen_ocl_internal_atanpi(x);
+}
+
+OVERLOADABLE float atanh(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_atanh(x);
+
+ return __gen_ocl_internal_atanh(x);
+}
+
+OVERLOADABLE float cbrt(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_cbrt(x);
+
+ return __gen_ocl_internal_cbrt(x);
+}
+
+OVERLOADABLE float rint(float x) {
+ return __gen_ocl_internal_rint(x);
+}
+
+OVERLOADABLE float copysign(float x, float y) {
+ return __gen_ocl_internal_copysign(x, y);
+}
+
+OVERLOADABLE float erf(float x) {
+ return __gen_ocl_internal_erf(x);
+}
+
+OVERLOADABLE float erfc(float x) {
+ return __gen_ocl_internal_erfc(x);
+}
+
+OVERLOADABLE float fmod (float x, float y) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_fmod(x, y);
+
+ return __gen_ocl_internal_fmod(x, y);
+}
+
+OVERLOADABLE float remainder(float x, float p) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_remainder(x, p);
+
+ return __gen_ocl_internal_remainder(x, p);
+}
+
+OVERLOADABLE float ldexp(float x, int n) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_ldexp(x, n);
+
+ if (x == (float)0.0f) x = 0.0f; /* collapses -0.0f to +0.0f */
+ return __gen_ocl_internal_ldexp(x, n);
+}
+
+CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
+CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16");
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
+
+OVERLOADABLE float mad(float a, float b, float c) {
+ return __gen_ocl_mad(a, b, c);
+}
+
+
+#define BODY \
+ if (isnan(x) || isinf(x)) { \
+ *exp = 0; \
+ return x; \
+ } \
+ uint u = as_uint(x); \
+ uint a = u & 0x7FFFFFFFu; \
+ if (a == 0) { \
+ *exp = 0; \
+ return x; \
+ } \
+ if (a >= 0x800000) { \
+ *exp = (a >> 23) - 126; \
+ return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+ } \
+ int e = -126; \
+ while (a < 0x400000) { \
+ e --; \
+ a <<= 1; \
+ } \
+ a <<= 1; \
+ *exp = e; \
+ return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+OVERLOADABLE float frexp(float x, int *exp) { BODY; }
+#undef BODY
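+/* editor's sketch (illustrative): frexp(12.0f, &e) returns 0.75f with
+ e = 4, since 12.0 = 0.75 * 2**4: the bits 0x41400000 take the normal
+ path, *exp = 0x82 - 126 = 4, and the exponent field is overwritten
+ with that of 0x3F000000 to land the mantissa in [0.5, 1). */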
+
+OVERLOADABLE float nextafter(float x, float y) {
+ int hx, hy, ix, iy;
+ hx = as_int(x);
+ hy = as_int(y);
+ ix = hx & 0x7fffffff;
+ iy = hy & 0x7fffffff;
+ if(ix == 0)
+ ix = hx & 0x7fffff;
+ if(iy == 0)
+ iy = hy & 0x7fffff;
+ if(ix>0x7f800000 || iy>0x7f800000)
+ return x+y;
+ if(hx == hy)
+ return y;
+ if(ix == 0) {
+ if(iy == 0)
+ return y;
+ else
+ return as_float((hy&0x80000000) | 1);
+ }
+ if(hx >= 0) {
+ if(hx > hy) {
+ hx -= 1;
+ } else {
+ hx += 1;
+ }
+ } else {
+ if(hy >= 0 || hx > hy){
+ hx -= 1;
+ } else {
+ hx += 1;
+ }
+ }
+ return as_float(hx);
+}
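+/* editor's sketch (illustrative): nextafter(1.0f, 2.0f) bumps the bit
+ pattern 0x3F800000 to 0x3F800001, one ulp above 1.0f; stepping toward
+ a smaller value decrements the pattern instead. */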
+
+#define BODY \
+ uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+ if (ix > 0x7F800000) { \
+ *i = nan(0u); \
+ return nan(0u); \
+ } \
+ if (ix == 0x7F800000) { \
+ *i = x; \
+ return as_float(hx & 0x80000000u); \
+ } \
+ *i = __gen_ocl_rndz(x); \
+ return x - *i;
+OVERLOADABLE float modf(float x, float *i) { BODY; }
+#undef BODY
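+/* editor's sketch (illustrative): modf(-2.75f, &i) stores i = -2.0f
+ (__gen_ocl_rndz truncates toward zero) and returns -0.75f; both parts
+ carry the sign of x. */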
+
+OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) { return max(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) { return min(a,b); }
+OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+ float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+ return a > b ? x : b > a ? y : max(x, y);
+}
+OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+ float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+ return a < b ? x : b < a ? y : min(x, y);
+}
+OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+ if(isnan(x))
+ return x;
+ if(isnan(y))
+ return y;
+ return x > y ? (x - y) : +0.f;
+}
+/*
+ * the pow/pown high precision implementations are copied from the msun library.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+ float z,ax,z_h,z_l,p_h,p_l;
+ float y1,t1,t2,r,s,sn,t,u,v,w;
+ int i,j,k,yisint,n;
+ int hx,hy,ix,iy,is;
+ float bp[2],dp_h[2],dp_l[2],
+ zero = 0.0,
+ one = 1.0,
+ two = 2.0,
+ two24 = 16777216.0, /* 0x4b800000 */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+ L1 = 6.0000002384e-01, /* 0x3f19999a */
+ L2 = 4.2857143283e-01, /* 0x3edb6db7 */
+ P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+ P2 = -2.7777778450e-03, /* 0xbb360b61 */
+ lg2 = 6.9314718246e-01, /* 0x3f317218 */
+ lg2_h = 6.93145752e-01, /* 0x3f317200 */
+ lg2_l = 1.42860654e-06, /* 0x35bfbe8c */
+ ovt = 4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+ cp = 9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+ cp_h = 9.6179199219e-01, /* 0x3f763800 =head of cp */
+ cp_l = 4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ ivln2_h = 1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+ ivln2_l = 7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+ bp[0] = 1.0,bp[1] = 1.5,
+ dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+ dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ ix = hx&0x7fffffff; iy = hy&0x7fffffff;
+ if (ix < 0x00800000) { /* x < 2**-126 */
+ ix = 0;/* Gen does not support subnormal number now */
+ }
+ if (iy < 0x00800000) { /* y < 2**-126 */
+ iy = 0;/* Gen does not support subnormal number now */
+ }
+ /* y==zero: x**0 = 1 */
+ if(iy==0) return one;
+ /* pow(+1, y) returns 1 for any y, even a NAN */
+ if(hx==0x3f800000) return one;
+ /* +-NaN return x+y */
+ if(ix > 0x7f800000 || iy > 0x7f800000)
+ return (x+0.0f)+y+(0.0f);
+ /* determine if y is an odd int when x < 0
+ * yisint = 0 ... y is not an integer
+ * yisint = 1 ... y is an odd int
+ * yisint = 2 ... y is an even int
+ */
+ yisint = 0;
+ if(hx<0) {
+ if(iy>=0x4b800000) yisint = 2; /* even integer y */
+ else if(iy>=0x3f800000) {
+ k = (iy>>23)-0x7f; /* exponent */
+ j = iy>>(23-k);
+ if((j<<(23-k))==iy) yisint = 2-(j&1);
+ }
+ }
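+ /* editor's worked example (illustrative): y = 3.0f has iy = 0x40400000,
+ so k = (iy>>23)-0x7f = 1 and j = iy>>22 = 0x101; (j<<22)==iy confirms
+ an integer and yisint = 2-(j&1) = 1 marks it odd, while y = 4.0f gives
+ the even j = 0x204 and yisint = 2. */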
+ /* special value of y */
+ if (iy==0x7f800000) { /* y is +-inf */
+ if (ix==0x3f800000)
+ //return y - y; /* inf**+-1 is NaN */
+ return one;
+ else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+ return (hy>=0)? y: zero;
+ else /* (|x|<1)**-,+inf = inf,0 */
+ return (hy<0)?-y: zero;
+ }
+ if(iy==0x3f800000) { /* y is +-1 */
+ if(hy<0) return one/x; else return x;
+ }
+ if(hy==0x40000000) return x*x; /* y is 2 */
+ if(hy==0x3f000000) { /* y is 0.5 */
+ if(hx>=0)return __gen_ocl_sqrt(x);
+ }
+
+ ax = __gen_ocl_fabs(x);
+ /* special value of x */
+ if(ix==0x7f800000||ix==0||ix==0x3f800000){
+ z = ax; /*x is +-0,+-inf,+-1*/
+ if(hy<0) z = one/z; /* z = (1/|x|) */
+ if(hx<0) {
+ if(((ix-0x3f800000)|yisint)==0) {
+ z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+ } else if(yisint==1)
+ z = -z; /* (x<0)**odd = -(|x|**odd) */
+ }
+ return z;
+ }
+ n = ((uint)hx>>31)-1;
+
+ /* (x<0)**(non-int) is NaN */
+ if((n|yisint)==0) return (x-x)/(x-x);
+
+ sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+ if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+
+ /* |y| is huge */
+ if(iy>0x4d000000) { /* if |y| > 2**27 */
+ /* over/underflow if x is not close to one */
+ if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+ if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+ /* now |1-x| is tiny <= 2**-20, suffice to compute
+ log(x) by x-x^2/2+x^3/3-x^4/4 */
+ t = ax-1; /* t has 20 trailing zeros */
+ w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+ u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
+ v = t*ivln2_l-w*ivln2;
+ t1 = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+ t2 = v-(t1-u);
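+ /* editor's note: u+v = (t-w)/ln2 ~= log2(x) for x near one; t1 keeps
+ the high-order bits (low 12 mantissa bits zeroed) and t2 = v-(t1-u)
+ is the exact low-order remainder, preserving precision in y*log2(x). */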
+ } else {
+ float s2,s_h,s_l,t_h,t_l;
+ n = 0;
+ /* take care subnormal number */
+ //if(ix<0x00800000)
+ //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+ n += ((ix)>>23)-0x7f;
+ j = ix&0x007fffff;
+ /* determine interval */
+ ix = j|0x3f800000; /* normalize ix */
+ if(j<=0x1cc471) k=0; /* |x|<sqrt(3/2) */
+ else if(j<0x5db3d7) k=1; /* |x|<sqrt(3) */
+ else {k=0;n+=1;ix -= 0x00800000;}
+ GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+ /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+ u = ax-bp[k]; /* bp[0]=1.0, bp[1]=1.5 */
+ v = one/(ax+bp[k]);
+ s = u*v;
+ s_h = s;
+ GEN_OCL_GET_FLOAT_WORD(is,s_h);
+ GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+ /* t_h=ax+bp[k] High */
+ is = ((ix>>1)&0xfffff000)|0x20000000;
+ GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+ t_l = ax - (t_h-bp[k]);
+ s_l = v*((u-s_h*t_h)-s_h*t_l);
+
+ /* compute log(ax) */
+ s2 = s*s;
+ r = s2*s2*(L1+s2*L2);
+ r += s_l*(s_h+s);
+ s2 = s_h*s_h;
+ t_h = 3.0f+s2+r;
+ GEN_OCL_GET_FLOAT_WORD(is,t_h);
+ GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+ t_l = r-((t_h-3.0f)-s2);
+ /* u+v = s*(1+...) */
+ u = s_h*t_h;
+ v = s_l*t_h+t_l*s;
+ /* 2/(3log2)*(s+...) */
+ p_h = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,p_h);
+ GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+ p_l = v-(p_h-u);
+ z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
+ z_l = cp_l*p_h+p_l*cp+dp_l[k];
+ /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+ t = (float)n;
+ t1 = (((z_h+z_l)+dp_h[k])+t);
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+ t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+ }
+
+ /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+ GEN_OCL_GET_FLOAT_WORD(is,y);
+ GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
+ p_l = (y-y1)*t1+y*t2;
+ p_h = y1*t1;
+ z = p_l+p_h;
+ GEN_OCL_GET_FLOAT_WORD(j,z);
+ if (j>0x43000000) /* if z > 128 */
+ return sn*huge*huge; /* overflow */
+ else if (j==0x43000000) { /* if z == 128 */
+ if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
+ }
+ else if ((j&0x7fffffff)>0x43160000) /* z <= -150 */
+ return sn*tiny*tiny; /* underflow */
+ else if (j==0xc3160000){ /* z == -150 */
+ if(p_l<=z-p_h) return sn*tiny*tiny; /* underflow */
+ }
+
+ /*
+ * compute 2**(p_h+p_l)
+ */
+ i = j&0x7fffffff;
+ k = (i>>23)-0x7f;
+ n = 0;
+ if(i>0x3f000000) { /* if |z| > 0.5, set n = [z+0.5] */
+ n = j+(0x00800000>>(k+1));
+ k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */
+ GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+ n = ((n&0x007fffff)|0x00800000)>>(23-k);
+ if(j<0) n = -n;
+ p_h -= t;
+ }
+ t = p_l+p_h;
+ GEN_OCL_GET_FLOAT_WORD(is,t);
+ GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+ u = t*lg2_h;
+ v = (p_l-(t-p_h))*lg2+t*lg2_l;
+ z = u+v;
+ w = v-(z-u);
+ t = z*z;
+ t1 = z - t*(P1+t*P2);
+ r = (z*t1)/(t1-two)-(w+z*w);
+ z = one-(r-z);
+ GEN_OCL_GET_FLOAT_WORD(j,z);
+ j += (n<<23);
+ if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n); /* subnormal output */
+ else GEN_OCL_SET_FLOAT_WORD(z,j);
+ return sn*z;
+}
+
+OVERLOADABLE float tgamma (float x)
+{
+ /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper at cygnus.com> */
+
+ unsigned int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ if (hx == 0xff800000)
+ {
+ /* x == -Inf. According to ISO this is NaN. */
+ return NAN;
+ }
+ if ((hx & 0x7f800000) == 0x7f800000)
+ {
+ /* Positive infinity (return positive infinity) or NaN (return
+ NaN). */
+ return x;
+ }
+ if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
+ {
+ /* integer x < 0 */
+ return NAN;
+ }
+
+ if (x >= 36.0f)
+ {
+ /* Overflow. */
+ return INFINITY;
+ }
+ else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
+ {
+ return 1.0f / x;
+ }
+ else
+ {
+ float sinpix = __gen_ocl_internal_sinpi(x);
+ if (x <= -42.0f)
+ /* Underflow. */
+ {return 0.0f * sinpix /*for sign*/;}
+ int exp2_adj = 0;
+ float x_abs = __gen_ocl_fabs(x);
+ float gam0;
+
+ if (x_abs < 4.0f) {
+ /* gamma = exp(lgamma) is only accurate for small lgamma */
+ float prod,x_adj;
+ if (x_abs < 0.5f) {
+ prod = 1.0f / x_abs;
+ x_adj = x_abs + 1.0f;
+ } else if (x_abs <= 1.5f) {
+ prod = 1.0f;
+ x_adj = x_abs;
+ } else if (x_abs < 2.5f) {
+ x_adj = x_abs - 1.0f;
+ prod = x_adj;
+ } else {
+ x_adj = x_abs - 2.0f;
+ prod = x_adj * (x_abs - 1.0f);
+ }
+ gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
+ }
+ else {
+ /* Compute gamma (X) using Stirling's approximation,
+ starting by computing pow (X, X) with a power of 2
+ factored out to avoid intermediate overflow. */
+ float x_int = __gen_ocl_internal_round (x_abs);
+ float x_frac = x_abs - x_int;
+ int x_log2;
+ float x_mant = frexp (x_abs, &x_log2);
+ if (x_mant < M_SQRT1_2_F)
+ {
+ x_log2--;
+ x_mant *= 2.0f;
+ }
+ exp2_adj = x_log2 * (int) x_int;
+ float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
+ * exp2 (x_log2 * x_frac)
+ * __gen_ocl_internal_exp (-x_abs)
+ * sqrt (2.0f * M_PI_F / x_abs) );
+
+ float x2 = x_abs * x_abs;
+ float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
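+ /* editor's note: bsum/x_abs is the Stirling correction series
+ 1/(12x) - 1/(360x^3) + 1/(1260x^5); the hex constants above are
+ 1/1260, 1/360 and 1/12. */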
+ gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
+ }
+ if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
+ float gam1 = M_PI_F / (-x * sinpix * gam0);
+ return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
+ }
+}
+
+float __gen_ocl_internal_pown(float x, int y) {
+ const float
+ bp[] = {1.0, 1.5,},
+ dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
+ dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
+ zero = 0.0,
+ one = 1.0,
+ two = 2.0,
+ two24 = 16777216.0, /* 0x4b800000 */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+ L1 = 6.0000002384e-01, /* 0x3f19999a */
+ L2 = 4.2857143283e-01, /* 0x3edb6db7 */
+ P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+ P2 = -2.7777778450e-03, /* 0xbb360b61 */
+ lg2 = 6.9314718246e-01, /* 0x3f317218 */
+ lg2_h = 0x1.62ep-1,
+ lg2_l = 0x1.0bfbe8p-15,
+ ovt = 4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+ cp = 9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+ cp_h = 9.6179199219e-01, /* 0x3f763800 =head of cp */
+ cp_l = 4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ ivln2_h = 1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+ ivln2_l = 7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+
+ float z,ax,z_h,z_l,p_h,p_l;
+ float y1,t1,t2,r,s,t,u,v,w;
+ int i,j,k,yisint,n;
+ int hx,ix,iy,is;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
+ /* y==zero: x**0 = 1 */
+ if(y==0) return one;
+
+ /* +-NaN return NAN */
+ if(ix > 0x7f800000)
+ return NAN;
+
+  /* determine the parity of y (y is always an integer here)
+ * yisint = 1 ... y is an odd int
+ * yisint = 2 ... y is an even int
+ */
+ yisint = y&1 ? 1 : 2;
+
+ if (y == 1) return x;
+ if (y == -1) return one/x;
+ if (y == 2) return x*x;
+
+ ax = __gen_ocl_fabs(x);
+
+ /* special value of x */
+ if(ix==0x7f800000||ix==0||ix==0x3f800000){
+ z = ax; /*x is +-0,+-inf,+-1*/
+ if(y<0) z = one/z; /* z = (1/|x|) */
+ if(hx<0) {
+ if(yisint==1)
+ z = -z; /* (x<0)**odd = -(|x|**odd) */
+ }
+ return z;
+ }
+
+  float sn = one; /* sn = sign of result: -1 for (-ve)**odd, otherwise +1 */
+ if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
+ sn = -one; /* (-ve)**(odd int) */
+
+ /* |y| is huge */
+ if(iy>0x08000000) { /* if |y| > 2**27 */
+ /* over/underflow if x is not close to one */
+ if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:tiny*tiny;
+ if(ix>0x3f800007) return (y>0)? sn*huge*huge:tiny*tiny;
+    /* now |1-x| is tiny (<= 2**-20), so it suffices to compute
+       log(x) by x-x^2/2+x^3/3-x^4/4 */
+ t = ax-1; /* t has 20 trailing zeros */
+ w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
+ u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
+ v = t*ivln2_l-w*ivln2;
+ t1 = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+ t2 = v-(t1-u);
+ } else {
+ float s2,s_h,s_l,t_h,t_l;
+ n = 0;
+    /* take care of subnormal numbers (handling disabled below) */
+// if(ix<0x00800000)
+// {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+ n += ((ix)>>23)-0x7f;
+ j = ix&0x007fffff;
+ /* determine interval */
+ ix = j|0x3f800000; /* normalize ix */
+ if(j<=0x1cc471) k=0; /* |x|<sqrt(3/2) */
+ else if(j<0x5db3d7) k=1; /* |x|<sqrt(3) */
+ else {k=0;n+=1;ix -= 0x00800000;}
+ GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+ /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+ u = ax-bp[k]; /* bp[0]=1.0, bp[1]=1.5 */
+ v = one/(ax+bp[k]);
+ s = u*v;
+ s_h = s;
+ GEN_OCL_GET_FLOAT_WORD(is,s_h);
+ GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+
+ /* t_h=ax+bp[k] High */
+ GEN_OCL_SET_FLOAT_WORD(t_h, (((ix>>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
+ t_l = ax - (t_h-bp[k]);
+ s_l = v*((u-s_h*t_h)-s_h*t_l);
+
+
+ /* compute log(ax) */
+ s2 = s*s;
+ r = s2*s2*(L1+s2*L2);
+ r += s_l*(s_h+s);
+ s2 = s_h*s_h;
+ t_h = (float)3.0+s2+r;
+ GEN_OCL_GET_FLOAT_WORD(is,t_h);
+ GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+ t_l = r-((t_h-(float)3.0)-s2);
+ /* u+v = s*(1+...) */
+ u = s_h*t_h;
+ v = s_l*t_h+t_l*s;
+ /* 2/(3log2)*(s+...) */
+ p_h = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,p_h);
+ GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+ p_l = v-(p_h-u);
+ z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
+ z_l = cp_l*p_h+p_l*cp+dp_l[k];
+ /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+ t = (float)n;
+ t1 = (((z_h+z_l)+dp_h[k])+t);
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+ t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+ }
+
+ /* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
+
+ float fy = (float)y;
+ float y3 = (float)(y-(int)fy);
+ GEN_OCL_GET_FLOAT_WORD(is,fy);
+ GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+
+ p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
+ p_h = y1*t1;
+ z = p_l+p_h;
+
+ GEN_OCL_GET_FLOAT_WORD(j,z);
+ if (j>0x43000000) /* if z > 128 */
+ return sn*huge*huge; /* overflow */
+ else if (j==0x43000000) { /* if z == 128 */
+ if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
+ }
+  else if ((j&0x7fffffff)>0x43160000) /* z < -150 */
+ return sn*tiny*tiny; /* underflow */
+ else if (j==0xc3160000){ /* z == -150 */
+ if(p_l<=z-p_h) return sn*tiny*tiny; /* underflow */
+ }
+ /*
+ * compute 2**(p_h+p_l)
+ */
+ i = j&0x7fffffff;
+ k = (i>>23)-0x7f;
+ n = 0;
+ if(i>0x3f000000) { /* if |z| > 0.5, set n = [z+0.5] */
+ n = j+(0x00800000>>(k+1));
+ k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */
+ GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+ n = ((n&0x007fffff)|0x00800000)>>(23-k);
+ if(j<0) n = -n;
+ p_h -= t;
+
+ z -= n;
+ }
+
+ t = z;
+ GEN_OCL_GET_FLOAT_WORD(is,t);
+ GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
+ u = t*lg2_h;
+ v = (p_l-(t-p_h))*lg2+t*lg2_l;
+ z = u+v;
+ w = v-(z-u);
+ t = z*z;
+ t1 = z - t*(P1+t*P2);
+ r = (z*t1)/(t1-two)-(w+z*w);
+ z = one-(r-z);
+ GEN_OCL_GET_FLOAT_WORD(j,z);
+ j += (n<<23);
+ if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n); /* subnormal output */
+ else GEN_OCL_SET_FLOAT_WORD(z,j);
+ return sn*z;
+}
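
In exact arithmetic the head/tail pairs above implement the classic
decomposition (an identity sketch, not additional code):

    x^{y} = 2^{\,y \log_2 |x|}, \qquad
    y \log_2 |x| \approx (y_1 + (fy - y_1) + y_3)(t_1 + t_2) = p_h + p_l

where t1+t2 is an extended-precision log2|x| and y1 keeps the top bits of
(float)y. The early exits follow from float range limits: 0x43000000 and
0x43160000 are the bit patterns of 128.0f and 150.0f, 2**128 overflows float,
and 2**-150 is below the smallest subnormal (2**-149).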
+
+OVERLOADABLE float hypot(float x, float y) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_hypot(x, y);
+
+  //return __gen_ocl_sqrt(x*x + y*y); /* naive form: x*x may overflow/underflow */
+ float a,b,an,bn,cn;
+ int e;
+ if (isfinite (x) && isfinite (y)){ /* Determine absolute values. */
+ x = __gen_ocl_fabs (x);
+ y = __gen_ocl_fabs (y);
+ /* Find the bigger and the smaller one. */
+ a = max(x,y);
+ b = min(x,y);
+ /* Now 0 <= b <= a. */
+ /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1. */
+ an = frexp (a, &e);
+ bn = ldexp (b, - e);
+    /* Thanks to the normalization, no spurious overflow or underflow can occur here. */
+ cn = __gen_ocl_sqrt (an * an + bn * bn);
+ return ldexp (cn, e);
+ }else{
+ if (isinf (x) || isinf (y)) /* x or y is infinite. Return +Infinity. */
+ return INFINITY;
+ else /* x or y is NaN. Return NaN. */
+ return x + y;
+ }
+}
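
The frexp/ldexp step above is just the scaling identity (a one-line
derivation, assuming x and y are finite):

    \sqrt{x^{2} + y^{2}} = 2^{e} \sqrt{a_n^{2} + b_n^{2}}, \qquad
    a_n \in [\tfrac{1}{2}, 1),\ 0 \le b_n \le a_n

so the radicand stays in [1/4, 2) and the intermediate squares and sqrt can
neither overflow nor underflow; only the final ldexp can.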
+
+#define BODY \
+ if (isnan(x)) { \
+ *p = x; \
+ return x; \
+ } \
+ *p = __gen_ocl_internal_floor(x); \
+ if (isinf(x)) { \
+ return x > 0 ? +0. : -0.; \
+ } \
+ return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+OVERLOADABLE float fract(float x, float *p) { BODY; }
+#undef BODY
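
A minimal usage sketch of the semantics BODY implements (the kernel and buffer
names are illustrative, not part of the library):

    /* fract() returns the fractional part clamped to [0, 1) -- the fmin with
       0x1.FFFFFEp-1F keeps inputs such as -1e-9f, whose fraction would round
       up to 1.0f, strictly below one -- and stores floor(x) via the pointer. */
    kernel void fract_demo(global const float *in, global float *fr,
                           global float *ip) {
      size_t i = get_global_id(0);
      float flo;
      fr[i] = fract(in[i], &flo); /* e.g. fract(-0.25f) -> 0.75f, flo -> -1.0f */
      ip[i] = flo;
    }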
+
+#define BODY \
+ float Zero[2]; \
+ int n,hx,hy,hz,ix,iy,sx,i,sy; \
+ uint q,sxy; \
+ Zero[0] = 0.0;Zero[1] = -0.0; \
+ if (x == 0.0f) { x = 0.0f; }; \
+ if (y == 0.0f) { y = 0.0f; }\
+ GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
+ sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+ hx ^=sx; hy &= 0x7fffffff; \
+ if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
+ if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
+ *quo = 0;return NAN; \
+ } \
+ if( hy == 0x7F800000 || hx == 0 ) { \
+ *quo = 0;return x; \
+ } \
+ if( hx == hy ) { \
+ *quo = (x == y) ? 1 : -1; \
+ return sx ? -0.0 : 0.0; \
+ } \
+ if(hx<hy) { \
+ q = 0; \
+ goto fixup; \
+ } else if(hx==hy) { \
+ *quo = (sxy ? -1 : 1); \
+ return Zero[(uint)sx>>31]; \
+ } \
+ ix = (hx>>23)-127; \
+ iy = (hy>>23)-127; \
+ hx = 0x00800000|(0x007fffff&hx); \
+ hy = 0x00800000|(0x007fffff&hy); \
+ n = ix - iy; \
+ q = 0; \
+ while(n--) { \
+ hz=hx-hy; \
+ if(hz<0) hx = hx << 1; \
+ else {hx = hz << 1; q++;} \
+ q <<= 1; \
+ } \
+ hz=hx-hy; \
+ if(hz>=0) {hx=hz;q++;} \
+ if(hx==0) { \
+ q &= 0x0000007f; \
+ *quo = (sxy ? -q : q); \
+ return Zero[(uint)sx>>31]; \
+ } \
+ while(hx<0x00800000) { \
+ hx <<= 1;iy -= 1; \
+ } \
+ if(iy>= -126) { \
+ hx = ((hx-0x00800000)|((iy+127)<<23)); \
+ } else {\
+ n = -126 - iy; \
+ hx >>= n; \
+ } \
+fixup: \
+ GEN_OCL_SET_FLOAT_WORD(x,hx); \
+ if(hx<0x00800000){ \
+ GEN_OCL_GET_FLOAT_WORD(hy,y); \
+ hy &= 0x7fffffff; \
+ if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+ x = 0; \
+ }else{ \
+ y = __gen_ocl_fabs(y); \
+ if (y < 0x1p-125f) { \
+ if (x+x>y || (x+x==y && (q & 1))) { \
+ q++;x-=y; \
+ } \
+ }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
+ q++;x-=y; \
+ } \
+ GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
+ } \
+ int sign = sx==sy?0:1; \
+ q &= 0x0000007f; \
+ *quo = (sign ? -q : q); \
+ return x;
+
+OVERLOADABLE float remquo(float x, float y, int *quo) {
+ BODY;
+}
+#undef BODY
+
+OVERLOADABLE float powr(float x, float y) {
+ unsigned int hx, sx, hy, sy;
+
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_pow(x,y);
+ else {
+ if (isnan(x) || isnan(y)) return NAN;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ sx = (hx & 0x80000000) >> 31;
+ sy = (hy & 0x80000000) >> 31;
+
+ if ((hx&0x7fffffff) < 0x00800000) { /* x < 2**-126 */
+      x = 0.0f; /* Gen does not support subnormal numbers yet */
+ hx = hx &0x80000000;
+ }
+ if ((hy&0x7fffffff) < 0x00800000) { /* y < 2**-126 */
+      y = 0.0f; /* Gen does not support subnormal numbers yet */
+ hy = hy &0x80000000;
+ }
+
+    // powr(x, y) = NaN for any x < 0
+ if ((sx && (hx & 0x7fffffff))) return NAN;
+
+ // +/-0 ** +/-0 = NAN
+ if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
+
+ // +inf ** +/-0 = NAN
+ if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
+
+    // any remaining x ** +/-0 = 1.0
+ if (!(hy&0x7fffffff)) return 1.0f;
+
+ // +1 ** inf = NAN; +1 ** finite = 1;
+ if (hx == 0x3f800000) {
+ return isinf(y) ? NAN : 1.0f;
+ }
+
+ if ( !(hx & 0x7fffffff)) {
+ // +/-0 ** y<0 = +inf
+ // +/-0 ** y>0 = +0
+ return sy ? INFINITY : 0.0f;
+ }
+
+ return __gen_ocl_internal_pow(x,y);
+ }
+}
+
+OVERLOADABLE float pown(float x, int n) {
+ if (__ocl_math_fastpath_flag) {
+ if (x == 0.f && n == 0)
+ return 1.f;
+ if (x < 0.f && (n&1) )
+ return -powr(-x, n);
+ return powr(x, n);
+ } else {
+ int ix;
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ float sign = ix < 0 ? -1.0f : 1.0f;
+ if (x == 0.0f) x = sign * 0.0f;
+
+ return __gen_ocl_internal_pown(x, n);
+ }
+}
+
+OVERLOADABLE float pow(float x, float y) {
+ if (!__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_pow(x,y);
+ else {
+ int n;
+ if (x == 0.f && y == 0.f)
+ return 1.f;
+ if (x >= 0.f)
+ return powr(x, y);
+ n = y;
+ if ((float)n == y)//is exact integer
+ return pown(x, n);
+ return NAN;
+ }
+}
+
+OVERLOADABLE float rootn(float x, int n) {
+ float ax,re;
+ int sign = 0;
+ int hx;
+ if( n == 0 )return NAN;
+
+ GEN_OCL_GET_FLOAT_WORD(hx, x);
+  // Gen does not support denormals; flush to zero
+ if ((hx & 0x7fffffff) < 0x00800000) {
+ x = hx < 0 ? -0.0f : 0.0f;
+ }
+
+  //rootn ( x, n ) returns a NaN for x < 0 when n is even.
+ if( x < 0 && 0 == (n&1) )
+ return NAN;
+ if( x == 0.0 ){
+ switch( n & 0x80000001 ){
+ //rootn ( +-0, n ) is +0 for even n > 0.
+ case 0:
+ return 0.0f;
+ //rootn ( +-0, n ) is +-0 for odd n > 0.
+ case 1:
+ return x;
+ //rootn ( +-0, n ) is +inf for even n < 0.
+ case 0x80000000:
+ return INFINITY;
+
+ //rootn ( +-0, n ) is +-inf for odd n < 0.
+ case 0x80000001:
+ return __gen_ocl_internal_copysign(INFINITY, x);
+ }
+ }
+ ax = __gen_ocl_fabs(x);
+ if(x <0.0f && (n&1))
+ sign = 1;
+ if (__ocl_math_fastpath_flag)
+ re = __gen_ocl_pow(ax, 1.f/n);
+ else
+ re = __gen_ocl_internal_pow(ax,1.f/n);
+ if(sign)
+ re = -re;
+ return re;
+}
+
+OVERLOADABLE float fabs(float x) {
+ return __gen_ocl_internal_fabs(x);
+}
+
+OVERLOADABLE float trunc(float x) {
+ return __gen_ocl_internal_trunc(x);
+}
+
+OVERLOADABLE float round(float x) {
+ return __gen_ocl_internal_round(x);
+}
+
+OVERLOADABLE float floor(float x) {
+ return __gen_ocl_internal_floor(x);
+}
+
+OVERLOADABLE float ceil(float x) {
+ return __gen_ocl_internal_ceil(x);
+}
+
+OVERLOADABLE float log(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_log(x);
+
+ /* Use native instruction when it has enough precision */
+ if((x > 0x1.1p0) || (x <= 0))
+ return __gen_ocl_internal_fastpath_log(x);
+
+ return __gen_ocl_internal_log(x);
+}
+
+OVERLOADABLE float log2(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_log2(x);
+
+ /* Use native instruction when it has enough precision */
+ if((x > 0x1.1p0) || (x <= 0))
+ return __gen_ocl_internal_fastpath_log2(x);
+
+ return __gen_ocl_internal_log2(x);
+}
+
+OVERLOADABLE float log10(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_log10(x);
+
+ /* Use native instruction when it has enough precision */
+ if((x > 0x1.1p0) || (x <= 0))
+ return __gen_ocl_internal_fastpath_log10(x);
+
+ return __gen_ocl_internal_log10(x);
+}
+
+OVERLOADABLE float exp(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_exp(x);
+
+ /* Use native instruction when it has enough precision */
+ if (x > -0x1.6p1 && x < 0x1.6p1)
+ return __gen_ocl_internal_fastpath_exp(x);
+
+ return __gen_ocl_internal_exp(x);
+}
+
+OVERLOADABLE float exp2(float x) {
+  /* Use the native instruction; for exp2 it always has enough precision */
+ return native_exp2(x);
+}
+
+OVERLOADABLE float exp10(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_exp10(x);
+
+ return __gen_ocl_internal_exp10(x);
+}
+
+OVERLOADABLE float expm1(float x) {
+ if (__ocl_math_fastpath_flag)
+ return __gen_ocl_internal_fastpath_expm1(x);
+
+ return __gen_ocl_internal_expm1(x);
+}
+
+OVERLOADABLE float fmin(float a, float b) {
+ return __gen_ocl_internal_fmin(a, b);
+}
+
+OVERLOADABLE float fmax(float a, float b) {
+ return __gen_ocl_internal_fmax(a, b);
+}
+
+OVERLOADABLE float fma(float a, float b, float c) {
+ return mad(a, b, c);
+}
+
+OVERLOADABLE float fdim(float x, float y) {
+ return __gen_ocl_internal_fdim(x, y);
+}
+
+OVERLOADABLE float maxmag(float x, float y) {
+ return __gen_ocl_internal_maxmag(x, y);
+}
+
+OVERLOADABLE float minmag(float x, float y) {
+ return __gen_ocl_internal_minmag(x, y);
+}
+
+
+/* So far, the HW does not support half-float math functions.
+   We simply convert to float and call the float version here. */
+OVERLOADABLE half cospi(half x) {
+ float _x = (float)x;
+ return (half)cospi(_x);
+}
+OVERLOADABLE half cosh(half x) {
+ float _x = (float)x;
+ return (half)cosh(_x);
+}
+OVERLOADABLE half acos(half x) {
+ float _x = (float)x;
+ return (half)acos(_x);
+}
+OVERLOADABLE float half_cos(float x) {
+ return (float)cos(x);
+}
+OVERLOADABLE float half_divide(float x, float y) {
+ return (float)native_divide(x, y);
+}
+OVERLOADABLE float half_exp(float x) {
+ return (float)native_exp(x);
+}
+OVERLOADABLE float half_exp2(float x){
+ return (float)native_exp2(x);
+}
+OVERLOADABLE float half_exp10(float x){
+ return (float)native_exp10(x);
+}
+OVERLOADABLE float half_log(float x){
+ return (float)native_log(x);
+}
+OVERLOADABLE float half_log2(float x){
+ return (float)native_log2(x);
+}
+OVERLOADABLE float half_log10(float x){
+ return (float)native_log10(x);
+}
+OVERLOADABLE float half_powr(float x, float y){
+ return (float)powr(x, y);
+}
+OVERLOADABLE float half_recip(float x){
+ return (float)native_recip(x);
+}
+OVERLOADABLE float half_rsqrt(float x){
+ return (float)native_rsqrt(x);
+}
+OVERLOADABLE float half_sin(float x){
+ return (float)sin(x);
+}
+OVERLOADABLE float half_sqrt(float x){
+ return (float)native_sqrt(x);
+}
+OVERLOADABLE float half_tan(float x){
+ return (float)tan(x);
+}
+OVERLOADABLE half acospi(half x) {
+ float _x = (float)x;
+ return (half)acospi(_x);
+}
+OVERLOADABLE half acosh(half x) {
+ float _x = (float)x;
+ return (half)acosh(_x);
+}
+OVERLOADABLE half sinpi(half x) {
+ float _x = (float)x;
+ return (half)sinpi(_x);
+}
+OVERLOADABLE half sinh(half x) {
+ float _x = (float)x;
+ return (half)sinh(_x);
+}
+OVERLOADABLE half asin(half x) {
+ float _x = (float)x;
+ return (half)asin(_x);
+}
+OVERLOADABLE half asinpi(half x) {
+ float _x = (float)x;
+ return (half)asinpi(_x);
+}
+OVERLOADABLE half asinh(half x) {
+ float _x = (float)x;
+ return (half)asinh(_x);
+}
+OVERLOADABLE half tanpi(half x) {
+ float _x = (float)x;
+ return (half)tanpi(_x);
+}
+OVERLOADABLE half tanh(half x) {
+ float _x = (float)x;
+ return (half)tanh(_x);
+}
+OVERLOADABLE half atan(half x) {
+ float _x = (float)x;
+ return (half)atan(_x);
+}
+OVERLOADABLE half atan2(half y, half x) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)atan2(_x, _y);
+}
+OVERLOADABLE half atan2pi(half y, half x) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)atan2pi(_x, _y);
+}
+OVERLOADABLE half atanpi(half x) {
+ float _x = (float)x;
+ return (half)atanpi(_x);
+}
+OVERLOADABLE half atanh(half x) {
+ float _x = (float)x;
+ return (half)atanh(_x);
+}
+OVERLOADABLE half cbrt(half x) {
+ float _x = (float)x;
+ return (half)cbrt(_x);
+}
+OVERLOADABLE half rint(half x) {
+ float _x = (float)x;
+ return (half)rint(_x);
+}
+OVERLOADABLE half copysign(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)copysign(_x, _y);
+}
+OVERLOADABLE half erf(half x) {
+ float _x = (float)x;
+ return (half)erf(_x);
+}
+OVERLOADABLE half erfc(half x) {
+ float _x = (float)x;
+ return (half)erfc(_x);
+}
+OVERLOADABLE half fmod(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)fmod(_x, _y);
+}
+OVERLOADABLE half remainder(half x, half p) {
+ float _x = (float)x;
+ float _p = (float)p;
+ return (half)remainder(_x, _p);
+}
+OVERLOADABLE half ldexp(half x, int n) {
+ float _x = (float)x;
+ return (half)ldexp(_x, n);
+}
+OVERLOADABLE half powr(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)powr(_x, _y);
+}
+OVERLOADABLE half pow(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)pow(_x, _y);
+}
+OVERLOADABLE half fabs(half x) {
+ float _x = (float)x;
+ return (half)fabs(_x);
+}
+OVERLOADABLE half trunc(half x) {
+ float _x = (float)x;
+ return (half)trunc(_x);
+}
+OVERLOADABLE half round(half x) {
+ float _x = (float)x;
+ return (half)round(_x);
+}
+OVERLOADABLE half floor(half x) {
+ float _x = (float)x;
+ return (half)floor(_x);
+}
+OVERLOADABLE half ceil(half x) {
+ float _x = (float)x;
+ return (half)ceil(_x);
+}
+OVERLOADABLE half log(half x) {
+ float _x = (float)x;
+ return (half)log(_x);
+}
+OVERLOADABLE half log2(half x) {
+ float _x = (float)x;
+ return (half)log2(_x);
+}
+OVERLOADABLE half log10(half x) {
+ float _x = (float)x;
+ return (half)log10(_x);
+}
+OVERLOADABLE half exp(half x) {
+ float _x = (float)x;
+ return (half)exp(_x);
+}
+OVERLOADABLE half exp10(half x) {
+ float _x = (float)x;
+ return (half)exp10(_x);
+}
+OVERLOADABLE half expm1(half x) {
+ float _x = (float)x;
+ return (half)expm1(_x);
+}
+OVERLOADABLE half fmin(half a, half b) {
+ return __gen_ocl_internal_fmin(a, b);
+}
+OVERLOADABLE half fmax(half a, half b) {
+ return __gen_ocl_internal_fmax(a, b);
+}
+OVERLOADABLE half fma(half a, half b, half c) {
+ float _a = (float)a;
+ float _b = (float)b;
+ float _c = (float)c;
+ return (half)fma(_a, _b, _c);
+}
+OVERLOADABLE half fdim(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)fdim(_x, _y);
+}
+OVERLOADABLE half maxmag(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)maxmag(_x, _y);
+}
+OVERLOADABLE half minmag(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)minmag(_x, _y);
+}
+OVERLOADABLE half exp2(half x) {
+ float _x = (float)x;
+ return (half)exp2(_x);
+}
+OVERLOADABLE half mad(half a, half b, half c) {
+ return __gen_ocl_mad(a,b,c);
+}
+OVERLOADABLE half sin(half x) {
+ float _x = (float)x;
+ return (half)sin(_x);
+}
+OVERLOADABLE half cos(half x) {
+ float _x = (float)x;
+ return (half)cos(_x);
+}
+OVERLOADABLE half tan(half x) {
+ float _x = (float)x;
+ return (half)tan(_x);
+}
+OVERLOADABLE half tgamma(half x) {
+ float _x = (float)x;
+ return (half)tgamma(_x);
+}
+OVERLOADABLE half lgamma(half x) {
+ float _x = (float)x;
+ return (half)lgamma(_x);
+}
+OVERLOADABLE half lgamma_r(half x, int *signgamp) {
+ float _x = (float)x;
+ return (half)lgamma_r(_x, signgamp);
+}
+OVERLOADABLE half log1p(half x) {
+ float _x = (float)x;
+ return (half)log1p(_x);
+}
+OVERLOADABLE half logb(half x) {
+ float _x = (float)x;
+ return (half)logb(_x);
+}
+OVERLOADABLE int ilogb(half x) {
+ float _x = (float)x;
+ return ilogb(_x);
+}
+OVERLOADABLE half nan(ushort code) {
+ return (half)NAN;
+}
+
+OVERLOADABLE half sincos(half x, half *cosval) {
+ float _x = (float)x;
+ float _cosval;
+ half ret = (half)sincos(_x, &_cosval);
+ *cosval = (half)_cosval;
+ return ret;
+}
+
+OVERLOADABLE half sqrt(half x) {
+ float _x = (float)x;
+ return (half)sqrt(_x);
+}
+OVERLOADABLE half rsqrt(half x) {
+ float _x = (float)x;
+ return (half)rsqrt(_x);
+}
+OVERLOADABLE half frexp(half x, int *exp) {
+ float _x = (float)x;
+ return (half)frexp(_x, exp);
+}
+OVERLOADABLE half nextafter(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)nextafter(_x, _y);
+}
+
+OVERLOADABLE half modf(half x, half *i) {
+ float _x = (float)x;
+ float _i;
+ half ret = (half)modf(_x, &_i);
+ *i = (half)_i;
+ return ret;
+}
+
+OVERLOADABLE half hypot(half x, half y) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)hypot(_x, _y);
+}
+
+OVERLOADABLE half fract(half x, half *p) {
+ float _x = (float)x;
+ float _p;
+ half ret = (half)fract(_x, &_p);
+ *p = (half)_p;
+ return ret;
+}
+
+OVERLOADABLE half remquo(half x, half y, int *quo) {
+ float _x = (float)x;
+ float _y = (float)y;
+ return (half)remquo(_x, _y, quo);
+}
+
+OVERLOADABLE half pown(half x, int n) {
+ float _x = (float)x;
+ return (half)pown(_x, n);
+}
+OVERLOADABLE half rootn(half x, int n) {
+ float _x = (float)x;
+ return (half)rootn(_x, n);
+}
diff --git a/backend/src/libocl/tmpl/ocl_math_20.tmpl.h b/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
new file mode 100644
index 0000000..271075c
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_MATH_20_H__
+#define __OCL_MATH_20_H__
+
+#include "ocl_types.h"
+
+OVERLOADABLE float cospi(float x);
+OVERLOADABLE float cosh(float x);
+OVERLOADABLE float acos(float x);
+OVERLOADABLE float acospi(float x);
+OVERLOADABLE float acosh(float x);
+OVERLOADABLE float sinpi(float x);
+OVERLOADABLE float sinh(float x);
+OVERLOADABLE float asin(float x);
+OVERLOADABLE float asinpi(float x);
+OVERLOADABLE float asinh(float x);
+OVERLOADABLE float tanpi(float x);
+OVERLOADABLE float tanh(float x);
+OVERLOADABLE float atan(float x);
+OVERLOADABLE float atan2(float y, float x);
+OVERLOADABLE float atan2pi(float y, float x);
+OVERLOADABLE float atanpi(float x);
+OVERLOADABLE float atanh(float x);
+OVERLOADABLE float cbrt(float x);
+OVERLOADABLE float rint(float x);
+OVERLOADABLE float copysign(float x, float y);
+OVERLOADABLE float erf(float x);
+OVERLOADABLE float erfc(float x);
+OVERLOADABLE float fmod (float x, float y);
+OVERLOADABLE float remainder(float x, float p);
+OVERLOADABLE float ldexp(float x, int n);
+OVERLOADABLE float powr(float x, float y);
+OVERLOADABLE float pow(float x, float y);
+OVERLOADABLE float fabs(float x);
+OVERLOADABLE float trunc(float x);
+OVERLOADABLE float round(float x);
+OVERLOADABLE float floor(float x);
+OVERLOADABLE float ceil(float x);
+OVERLOADABLE float log(float x);
+OVERLOADABLE float log2(float x);
+OVERLOADABLE float log10(float x);
+OVERLOADABLE float exp(float x);
+OVERLOADABLE float exp10(float x);
+OVERLOADABLE float expm1(float x);
+OVERLOADABLE float fmin(float a, float b);
+OVERLOADABLE float fmax(float a, float b);
+OVERLOADABLE float fma(float a, float b, float c);
+OVERLOADABLE float fdim(float x, float y);
+OVERLOADABLE float maxmag(float x, float y);
+OVERLOADABLE float minmag(float x, float y);
+OVERLOADABLE float exp2(float x);
+OVERLOADABLE float mad(float a, float b, float c);
+OVERLOADABLE float sin(float x);
+OVERLOADABLE float cos(float x);
+OVERLOADABLE float tan(float x);
+OVERLOADABLE float tgamma(float x);
+OVERLOADABLE float lgamma(float x);
+OVERLOADABLE float lgamma_r(float x, int *signgamp);
+OVERLOADABLE float log1p(float x);
+OVERLOADABLE float logb(float x);
+OVERLOADABLE int ilogb(float x);
+OVERLOADABLE float nan(uint code);
+OVERLOADABLE float sincos(float x, float *cosval);
+OVERLOADABLE float sqrt(float x);
+OVERLOADABLE float rsqrt(float x);
+OVERLOADABLE float frexp(float x, int *exp);
+OVERLOADABLE float nextafter(float x, float y);
+OVERLOADABLE float modf(float x, float *i);
+OVERLOADABLE float hypot(float x, float y);
+OVERLOADABLE float fract(float x, float *p);
+OVERLOADABLE float remquo(float x, float y, int *quo);
+OVERLOADABLE float pown(float x, int n);
+OVERLOADABLE float rootn(float x, int n);
+
+// native
+OVERLOADABLE float native_cos(float x);
+OVERLOADABLE float native_divide(float x, float y);
+OVERLOADABLE float native_exp(float x);
+OVERLOADABLE float native_exp2(float x);
+OVERLOADABLE float native_exp10(float x);
+OVERLOADABLE float native_log(float x);
+OVERLOADABLE float native_log2(float x);
+OVERLOADABLE float native_log10(float x);
+OVERLOADABLE float native_powr(float x, float y);
+OVERLOADABLE float native_recip(float x);
+OVERLOADABLE float native_rsqrt(float x);
+OVERLOADABLE float native_sin(float x);
+OVERLOADABLE float native_sqrt(float x);
+OVERLOADABLE float native_tan(float x);
+
+
+// Half float version.
+OVERLOADABLE half cospi(half x);
+OVERLOADABLE half cosh(half x);
+OVERLOADABLE half acos(half x);
+OVERLOADABLE half acospi(half x);
+OVERLOADABLE half acosh(half x);
+OVERLOADABLE half sinpi(half x);
+OVERLOADABLE half sinh(half x);
+OVERLOADABLE half asin(half x);
+OVERLOADABLE half asinpi(half x);
+OVERLOADABLE half asinh(half x);
+OVERLOADABLE half tanpi(half x);
+OVERLOADABLE half tanh(half x);
+OVERLOADABLE half atan(half x);
+OVERLOADABLE half atan2(half y, half x);
+OVERLOADABLE half atan2pi(half y, half x);
+OVERLOADABLE half atanpi(half x);
+OVERLOADABLE half atanh(half x);
+OVERLOADABLE half cbrt(half x);
+OVERLOADABLE half rint(half x);
+OVERLOADABLE half copysign(half x, half y);
+OVERLOADABLE half erf(half x);
+OVERLOADABLE half erfc(half x);
+OVERLOADABLE half fmod (half x, half y);
+OVERLOADABLE half remainder(half x, half p);
+OVERLOADABLE half ldexp(half x, int n);
+OVERLOADABLE half powr(half x, half y);
+OVERLOADABLE half pow(half x, half y);
+OVERLOADABLE half fabs(half x);
+OVERLOADABLE half trunc(half x);
+OVERLOADABLE half round(half x);
+OVERLOADABLE half floor(half x);
+OVERLOADABLE half ceil(half x);
+OVERLOADABLE half log(half x);
+OVERLOADABLE half log2(half x);
+OVERLOADABLE half log10(half x);
+OVERLOADABLE half exp(half x);
+OVERLOADABLE half exp10(half x);
+OVERLOADABLE half expm1(half x);
+OVERLOADABLE half fmin(half a, half b);
+OVERLOADABLE half fmax(half a, half b);
+OVERLOADABLE half fma(half a, half b, half c);
+OVERLOADABLE half fdim(half x, half y);
+OVERLOADABLE half maxmag(half x, half y);
+OVERLOADABLE half minmag(half x, half y);
+OVERLOADABLE half exp2(half x);
+OVERLOADABLE half mad(half a, half b, half c);
+OVERLOADABLE half sin(half x);
+OVERLOADABLE half cos(half x);
+OVERLOADABLE half tan(half x);
+OVERLOADABLE half tgamma(half x);
+OVERLOADABLE half lgamma(half x);
+OVERLOADABLE half lgamma_r(half x, int *signgamp);
+OVERLOADABLE half log1p(half x);
+OVERLOADABLE half logb(half x);
+OVERLOADABLE int ilogb(half x);
+OVERLOADABLE half nan(ushort code);
+OVERLOADABLE half sincos(half x, half *cosval);
+OVERLOADABLE half sqrt(half x);
+OVERLOADABLE half rsqrt(half x);
+OVERLOADABLE half frexp(half x, int *exp);
+OVERLOADABLE half nextafter(half x, half y);
+OVERLOADABLE half modf(half x, half *i);
+OVERLOADABLE half hypot(half x, half y);
+OVERLOADABLE half fract(half x, half *p);
+OVERLOADABLE half remquo(half x, half y, int *quo);
+OVERLOADABLE half pown(half x, int n);
+OVERLOADABLE half rootn(half x, int n);
+
+// native half
+OVERLOADABLE half native_cos(half x);
+OVERLOADABLE half native_divide(half x, half y);
+OVERLOADABLE half native_exp(half x);
+OVERLOADABLE half native_exp2(half x);
+OVERLOADABLE half native_exp10(half x);
+OVERLOADABLE half native_log(half x);
+OVERLOADABLE half native_log2(half x);
+OVERLOADABLE half native_log10(half x);
+OVERLOADABLE half native_powr(half x, half y);
+OVERLOADABLE half native_recip(half x);
+OVERLOADABLE half native_rsqrt(half x);
+OVERLOADABLE half native_sin(half x);
+OVERLOADABLE half native_sqrt(half x);
+OVERLOADABLE half native_tan(half x);
+
+// half accuracy
+OVERLOADABLE float half_cos(float x);
+OVERLOADABLE float half_divide(float x, float y);
+OVERLOADABLE float half_exp(float x);
+OVERLOADABLE float half_exp2(float x);
+OVERLOADABLE float half_exp10(float x);
+OVERLOADABLE float half_log(float x);
+OVERLOADABLE float half_log2(float x);
+OVERLOADABLE float half_log10(float x);
+OVERLOADABLE float half_powr(float x, float y);
+OVERLOADABLE float half_recip(float x);
+OVERLOADABLE float half_rsqrt(float x);
+OVERLOADABLE float half_sin(float x);
+OVERLOADABLE float half_sqrt(float x);
+OVERLOADABLE float half_tan(float x);
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 8e22015..97e33fe 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -38,17 +38,9 @@ uint get_sub_group_size(void)
/* broadcast */
#define BROADCAST_IMPL(GEN_TYPE) \
- OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id); \
- OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id) { \
+ OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, uint local_id); \
+ OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, uint local_id) { \
return __gen_ocl_sub_group_broadcast(a, local_id); \
- } \
- OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y); \
- OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y) { \
- return __gen_ocl_sub_group_broadcast(a, local_id_x, local_id_y); \
- } \
- OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z); \
- OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) { \
- return __gen_ocl_sub_group_broadcast(a, local_id_x, local_id_y, local_id_z); \
}
BROADCAST_IMPL(int)
@@ -58,8 +50,17 @@ BROADCAST_IMPL(ulong)
BROADCAST_IMPL(half)
BROADCAST_IMPL(float)
BROADCAST_IMPL(double)
+BROADCAST_IMPL(short)
+BROADCAST_IMPL(ushort)
#undef BROADCAST_IMPL
+OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id) {
+ return __gen_ocl_sub_group_broadcast(a, local_id);
+}
+
+OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id) {
+ return __gen_ocl_sub_group_broadcast(a, local_id);
+}
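
A minimal usage sketch for the new short/ushort broadcast overloads (kernel
and buffer names are illustrative; assumes the subgroup extensions are
enabled):

    kernel void bcast_demo(global const short *in, global short *out) {
      size_t i = get_global_id(0);
      /* every work-item in the sub-group receives lane 0's value */
      out[i] = intel_sub_group_broadcast(in[i], 0u);
    }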
#define RANGE_OP(RANGE, OP, GEN_TYPE, SIGN) \
OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_##RANGE##_##OP(bool sign, GEN_TYPE x); \
@@ -75,6 +76,8 @@ RANGE_OP(reduce, add, ulong, false)
RANGE_OP(reduce, add, half, true)
RANGE_OP(reduce, add, float, true)
RANGE_OP(reduce, add, double, true)
+RANGE_OP(reduce, add, short, true)
+RANGE_OP(reduce, add, ushort, false)
/* reduce min */
RANGE_OP(reduce, min, int, true)
RANGE_OP(reduce, min, uint, false)
@@ -83,6 +86,8 @@ RANGE_OP(reduce, min, ulong, false)
RANGE_OP(reduce, min, half, true)
RANGE_OP(reduce, min, float, true)
RANGE_OP(reduce, min, double, true)
+RANGE_OP(reduce, min, short, true)
+RANGE_OP(reduce, min, ushort, false)
/* reduce max */
RANGE_OP(reduce, max, int, true)
RANGE_OP(reduce, max, uint, false)
@@ -91,6 +96,8 @@ RANGE_OP(reduce, max, ulong, false)
RANGE_OP(reduce, max, half, true)
RANGE_OP(reduce, max, float, true)
RANGE_OP(reduce, max, double, true)
+RANGE_OP(reduce, max, short, true)
+RANGE_OP(reduce, max, ushort, false)
/* scan_inclusive add */
RANGE_OP(scan_inclusive, add, int, true)
@@ -100,6 +107,8 @@ RANGE_OP(scan_inclusive, add, ulong, false)
RANGE_OP(scan_inclusive, add, half, true)
RANGE_OP(scan_inclusive, add, float, true)
RANGE_OP(scan_inclusive, add, double, true)
+RANGE_OP(scan_inclusive, add, short, true)
+RANGE_OP(scan_inclusive, add, ushort, false)
/* scan_inclusive min */
RANGE_OP(scan_inclusive, min, int, true)
RANGE_OP(scan_inclusive, min, uint, false)
@@ -108,6 +117,8 @@ RANGE_OP(scan_inclusive, min, ulong, false)
RANGE_OP(scan_inclusive, min, half, true)
RANGE_OP(scan_inclusive, min, float, true)
RANGE_OP(scan_inclusive, min, double, true)
+RANGE_OP(scan_inclusive, min, short, true)
+RANGE_OP(scan_inclusive, min, ushort, false)
/* scan_inclusive max */
RANGE_OP(scan_inclusive, max, int, true)
RANGE_OP(scan_inclusive, max, uint, false)
@@ -116,6 +127,8 @@ RANGE_OP(scan_inclusive, max, ulong, false)
RANGE_OP(scan_inclusive, max, half, true)
RANGE_OP(scan_inclusive, max, float, true)
RANGE_OP(scan_inclusive, max, double, true)
+RANGE_OP(scan_inclusive, max, short, true)
+RANGE_OP(scan_inclusive, max, ushort, false)
/* scan_exclusive add */
RANGE_OP(scan_exclusive, add, int, true)
@@ -125,6 +138,8 @@ RANGE_OP(scan_exclusive, add, ulong, false)
RANGE_OP(scan_exclusive, add, half, true)
RANGE_OP(scan_exclusive, add, float, true)
RANGE_OP(scan_exclusive, add, double, true)
+RANGE_OP(scan_exclusive, add, short, true)
+RANGE_OP(scan_exclusive, add, ushort, false)
/* scan_exclusive min */
RANGE_OP(scan_exclusive, min, int, true)
RANGE_OP(scan_exclusive, min, uint, false)
@@ -133,6 +148,8 @@ RANGE_OP(scan_exclusive, min, ulong, false)
RANGE_OP(scan_exclusive, min, half, true)
RANGE_OP(scan_exclusive, min, float, true)
RANGE_OP(scan_exclusive, min, double, true)
+RANGE_OP(scan_exclusive, min, short, true)
+RANGE_OP(scan_exclusive, min, ushort, false)
/* scan_exclusive max */
RANGE_OP(scan_exclusive, max, int, true)
RANGE_OP(scan_exclusive, max, uint, false)
@@ -141,92 +158,267 @@ RANGE_OP(scan_exclusive, max, ulong, false)
RANGE_OP(scan_exclusive, max, half, true)
RANGE_OP(scan_exclusive, max, float, true)
RANGE_OP(scan_exclusive, max, double, true)
+RANGE_OP(scan_exclusive, max, short, true)
+RANGE_OP(scan_exclusive, max, ushort, false)
#undef RANGE_OP
-PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p);
+
+#define INTEL_RANGE_OP(RANGE, OP, GEN_TYPE, SIGN) \
+ OVERLOADABLE GEN_TYPE intel_sub_group_##RANGE##_##OP(GEN_TYPE x) { \
+ return __gen_ocl_sub_group_##RANGE##_##OP(SIGN, x); \
+ }
+
+INTEL_RANGE_OP(reduce, add, short, true)
+INTEL_RANGE_OP(reduce, add, ushort, false)
+INTEL_RANGE_OP(reduce, min, short, true)
+INTEL_RANGE_OP(reduce, min, ushort, false)
+INTEL_RANGE_OP(reduce, max, short, true)
+INTEL_RANGE_OP(reduce, max, ushort, false)
+INTEL_RANGE_OP(scan_inclusive, add, short, true)
+INTEL_RANGE_OP(scan_inclusive, add, ushort, false)
+INTEL_RANGE_OP(scan_inclusive, min, short, true)
+INTEL_RANGE_OP(scan_inclusive, min, ushort, false)
+INTEL_RANGE_OP(scan_inclusive, max, short, true)
+INTEL_RANGE_OP(scan_inclusive, max, ushort, false)
+INTEL_RANGE_OP(scan_exclusive, add, short, true)
+INTEL_RANGE_OP(scan_exclusive, add, ushort, false)
+INTEL_RANGE_OP(scan_exclusive, min, short, true)
+INTEL_RANGE_OP(scan_exclusive, min, ushort, false)
+INTEL_RANGE_OP(scan_exclusive, max, short, true)
+INTEL_RANGE_OP(scan_exclusive, max, ushort, false)
+
+#undef INTEL_RANGE_OP
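
And a matching usage sketch for the wrappers just generated (names are
illustrative only):

    kernel void reduce_demo(global const short *in, global short *out) {
      size_t i = get_global_id(0);
      /* every lane receives the sum of in[] across its sub-group */
      out[i] = intel_sub_group_reduce_add(in[i]);
    }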
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_mem(const global uint* p);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_mem2(const global uint* p);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_mem4(const global uint* p);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_mem8(const global uint* p);
OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem(p);
+ return __gen_ocl_sub_group_block_read_ui_mem(p);
}
OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem2(p);
+ return __gen_ocl_sub_group_block_read_ui_mem2(p);
}
OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem4(p);
-
+ return __gen_ocl_sub_group_block_read_ui_mem4(p);
}
OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem8(p);
+ return __gen_ocl_sub_group_block_read_ui_mem8(p);
+}
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p)
+{
+ return __gen_ocl_sub_group_block_read_ui_mem(p);
}
-void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data);
-void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data);
-void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data);
-void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data);
-OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data)
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p)
{
- __gen_ocl_sub_group_block_write_mem(p, data);
+ return __gen_ocl_sub_group_block_read_ui_mem2(p);
}
-OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p)
{
- __gen_ocl_sub_group_block_write_mem2(p, data);
+ return __gen_ocl_sub_group_block_read_ui_mem4(p);
}
-OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p)
{
- __gen_ocl_sub_group_block_write_mem4(p, data);
+ return __gen_ocl_sub_group_block_read_ui_mem8(p);
+}
+void __gen_ocl_sub_group_block_write_ui_mem(global uint* p, uint data);
+void __gen_ocl_sub_group_block_write_ui_mem2(global uint* p, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_mem4(global uint* p, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_mem8(global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(global uint* p, uint data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write2(global uint* p, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem2(p, data);
}
-OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
+OVERLOADABLE void intel_sub_group_block_write4(global uint* p, uint4 data)
{
- __gen_ocl_sub_group_block_write_mem8(p, data);
+ __gen_ocl_sub_group_block_write_ui_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write8(global uint* p, uint8 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem8(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui(global uint* p, uint data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui2(global uint* p, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui4(global uint* p, uint4 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p, uint8 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem8(p, data);
}
-PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y);
OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
}
OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
}
OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
}
OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
}
-void __gen_ocl_sub_group_block_write_image(image2d_t p, int x, int y, uint data);
-void __gen_ocl_sub_group_block_write_image2(image2d_t p, int x, int y, uint2 data);
-void __gen_ocl_sub_group_block_write_image4(image2d_t p, int x, int y, uint4 data);
-void __gen_ocl_sub_group_block_write_image8(image2d_t p, int x, int y, uint8 data);
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
+}
+
+void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data);
+void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data);
OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data)
{
- __gen_ocl_sub_group_block_write_image(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
}
OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data)
{
- __gen_ocl_sub_group_block_write_image2(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
}
OVERLOADABLE void intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data)
{
- __gen_ocl_sub_group_block_write_image4(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
}
OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data)
{
- __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data)
+{
+ __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data)
+{
+ __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data)
+{
+ __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+}
+
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_mem2(const global ushort* p);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_mem4(const global ushort* p);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_mem8(const global ushort* p);
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem(p);
+}
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem2(p);
+}
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem4(p);
+}
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem8(p);
+}
+
+void __gen_ocl_sub_group_block_write_us_mem(global ushort* p, ushort data);
+void __gen_ocl_sub_group_block_write_us_mem2(global ushort* p, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_mem4(global ushort* p, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_mem8(global ushort* p, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(global ushort* p, ushort data)
+{
+ __gen_ocl_sub_group_block_write_us_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us2(global ushort* p, ushort2 data)
+{
+ __gen_ocl_sub_group_block_write_us_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us4(global ushort* p, ushort4 data)
+{
+ __gen_ocl_sub_group_block_write_us_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p, ushort8 data)
+{
+ __gen_ocl_sub_group_block_write_us_mem8(p, data);
}
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y);
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y);
+}
+
+void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data);
+void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data)
+{
+ __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data)
+{
+ __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data)
+{
+ __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data)
+{
+ __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data);
+}
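
A usage sketch for the ushort block variants (names are illustrative; for
simplicity it assumes the work-group size equals the sub-group size, so each
work-group owns one contiguous block):

    kernel void block_demo(global ushort *buf) {
      global ushort *p = buf + get_group_id(0) * get_local_size(0);
      /* each lane reads p[get_sub_group_local_id()] in one block operation */
      ushort v = intel_sub_group_block_read_us(p);
      intel_sub_group_block_write_us(p, (ushort)(v * 2));
    }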
#define SHUFFLE_DOWN(TYPE) \
OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
TYPE res0, res1; \
@@ -238,6 +430,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
SHUFFLE_DOWN(float)
SHUFFLE_DOWN(int)
SHUFFLE_DOWN(uint)
+SHUFFLE_DOWN(short)
+SHUFFLE_DOWN(ushort)
#undef SHUFFLE_DOWN
#define SHUFFLE_UP(TYPE) \
@@ -251,6 +445,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
SHUFFLE_UP(float)
SHUFFLE_UP(int)
SHUFFLE_UP(uint)
+SHUFFLE_UP(short)
+SHUFFLE_UP(ushort)
#undef SHUFFLE_UP
#define SHUFFLE_XOR(TYPE) \
OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
@@ -259,4 +455,6 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
SHUFFLE_XOR(float)
SHUFFLE_XOR(int)
SHUFFLE_XOR(uint)
+SHUFFLE_XOR(short)
+SHUFFLE_XOR(ushort)
#undef SHUFFLE_XOR
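
For orientation, a sketch of the shuffle_down semantics these macros expose,
as described by cl_intel_subgroups (kernel and buffer names are illustrative):

    kernel void shift_demo(global const ushort *in, global ushort *out) {
      size_t i = get_global_id(0);
      ushort cur = in[i];
      ushort next = (ushort)0; /* value pulled in past the last lane */
      /* lane j receives cur from lane j+1, falling back to next at the end */
      out[i] = intel_sub_group_shuffle_down(cur, next, 1u);
    }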
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index ae3b379..608551b 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -35,30 +35,18 @@ uint get_sub_group_id(void);
uint get_sub_group_local_id(void);
/* broadcast */
-OVERLOADABLE int sub_group_broadcast(int a, size_t local_id);
-OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id);
-OVERLOADABLE long sub_group_broadcast(long a, size_t local_id);
-OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id);
-OVERLOADABLE half sub_group_broadcast(half a, size_t local_id);
-OVERLOADABLE float sub_group_broadcast(float a, size_t local_id);
-OVERLOADABLE double sub_group_broadcast(double a, size_t local_id);
-
-OVERLOADABLE int sub_group_broadcast(int a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE long sub_group_broadcast(long a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE half sub_group_broadcast(half a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE float sub_group_broadcast(float a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE double sub_group_broadcast(double a, size_t local_id_x, size_t local_id_y);
-
-OVERLOADABLE int sub_group_broadcast(int a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE long sub_group_broadcast(long a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE half sub_group_broadcast(half a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE float sub_group_broadcast(float a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE double sub_group_broadcast(double a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-
+OVERLOADABLE int sub_group_broadcast(int a, uint local_id);
+OVERLOADABLE uint sub_group_broadcast(uint a, uint local_id);
+OVERLOADABLE long sub_group_broadcast(long a, uint local_id);
+OVERLOADABLE ulong sub_group_broadcast(ulong a, uint local_id);
+OVERLOADABLE half sub_group_broadcast(half a, uint local_id);
+OVERLOADABLE float sub_group_broadcast(float a, uint local_id);
+OVERLOADABLE double sub_group_broadcast(double a, uint local_id);
+OVERLOADABLE short sub_group_broadcast(short a, uint local_id);
+OVERLOADABLE ushort sub_group_broadcast(ushort a, uint local_id);
+
+OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id);
+OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id);
/* reduce add */
OVERLOADABLE int sub_group_reduce_add(int x);
OVERLOADABLE uint sub_group_reduce_add(uint x);
@@ -67,6 +55,10 @@ OVERLOADABLE ulong sub_group_reduce_add(ulong x);
OVERLOADABLE half sub_group_reduce_add(half x);
OVERLOADABLE float sub_group_reduce_add(float x);
OVERLOADABLE double sub_group_reduce_add(double x);
+OVERLOADABLE short sub_group_reduce_add(short x);
+OVERLOADABLE ushort sub_group_reduce_add(ushort x);
+OVERLOADABLE short intel_sub_group_reduce_add(short x);
+OVERLOADABLE ushort intel_sub_group_reduce_add(ushort x);
/* reduce min */
OVERLOADABLE int sub_group_reduce_min(int x);
@@ -76,6 +68,10 @@ OVERLOADABLE ulong sub_group_reduce_min(ulong x);
OVERLOADABLE half sub_group_reduce_min(half x);
OVERLOADABLE float sub_group_reduce_min(float x);
OVERLOADABLE double sub_group_reduce_min(double x);
+OVERLOADABLE short sub_group_reduce_min(short x);
+OVERLOADABLE ushort sub_group_reduce_min(ushort x);
+OVERLOADABLE short intel_sub_group_reduce_min(short x);
+OVERLOADABLE ushort intel_sub_group_reduce_min(ushort x);
/* reduce max */
OVERLOADABLE int sub_group_reduce_max(int x);
@@ -85,6 +81,10 @@ OVERLOADABLE ulong sub_group_reduce_max(ulong x);
OVERLOADABLE half sub_group_reduce_max(half x);
OVERLOADABLE float sub_group_reduce_max(float x);
OVERLOADABLE double sub_group_reduce_max(double x);
+OVERLOADABLE short sub_group_reduce_max(short x);
+OVERLOADABLE ushort sub_group_reduce_max(ushort x);
+OVERLOADABLE short intel_sub_group_reduce_max(short x);
+OVERLOADABLE ushort intel_sub_group_reduce_max(ushort x);
/* scan_inclusive add */
OVERLOADABLE int sub_group_scan_inclusive_add(int x);
@@ -94,6 +94,10 @@ OVERLOADABLE ulong sub_group_scan_inclusive_add(ulong x);
OVERLOADABLE half sub_group_scan_inclusive_add(half x);
OVERLOADABLE float sub_group_scan_inclusive_add(float x);
OVERLOADABLE double sub_group_scan_inclusive_add(double x);
+OVERLOADABLE short sub_group_scan_inclusive_add(short x);
+OVERLOADABLE ushort sub_group_scan_inclusive_add(ushort x);
+OVERLOADABLE short intel_sub_group_scan_inclusive_add(short x);
+OVERLOADABLE ushort intel_sub_group_scan_inclusive_add(ushort x);
/* scan_inclusive min */
OVERLOADABLE int sub_group_scan_inclusive_min(int x);
@@ -103,6 +107,10 @@ OVERLOADABLE ulong sub_group_scan_inclusive_min(ulong x);
OVERLOADABLE half sub_group_scan_inclusive_min(half x);
OVERLOADABLE float sub_group_scan_inclusive_min(float x);
OVERLOADABLE double sub_group_scan_inclusive_min(double x);
+OVERLOADABLE short sub_group_scan_inclusive_min(short x);
+OVERLOADABLE ushort sub_group_scan_inclusive_min(ushort x);
+OVERLOADABLE short intel_sub_group_scan_inclusive_min(short x);
+OVERLOADABLE ushort intel_sub_group_scan_inclusive_min(ushort x);
/* scan_inclusive max */
OVERLOADABLE int sub_group_scan_inclusive_max(int x);
@@ -112,6 +120,10 @@ OVERLOADABLE ulong sub_group_scan_inclusive_max(ulong x);
OVERLOADABLE half sub_group_scan_inclusive_max(half x);
OVERLOADABLE float sub_group_scan_inclusive_max(float x);
OVERLOADABLE double sub_group_scan_inclusive_max(double x);
+OVERLOADABLE short sub_group_scan_inclusive_max(short x);
+OVERLOADABLE ushort sub_group_scan_inclusive_max(ushort x);
+OVERLOADABLE short intel_sub_group_scan_inclusive_max(short x);
+OVERLOADABLE ushort intel_sub_group_scan_inclusive_max(ushort x);
/* scan_exclusive add */
OVERLOADABLE int sub_group_scan_exclusive_add(int x);
@@ -121,6 +133,10 @@ OVERLOADABLE ulong sub_group_scan_exclusive_add(ulong x);
OVERLOADABLE half sub_group_scan_exclusive_add(half x);
OVERLOADABLE float sub_group_scan_exclusive_add(float x);
OVERLOADABLE double sub_group_scan_exclusive_add(double x);
+OVERLOADABLE short sub_group_scan_exclusive_add(short x);
+OVERLOADABLE ushort sub_group_scan_exclusive_add(ushort x);
+OVERLOADABLE short intel_sub_group_scan_exclusive_add(short x);
+OVERLOADABLE ushort intel_sub_group_scan_exclusive_add(ushort x);
/* scan_exclusive min */
OVERLOADABLE int sub_group_scan_exclusive_min(int x);
@@ -130,6 +146,10 @@ OVERLOADABLE ulong sub_group_scan_exclusive_min(ulong x);
OVERLOADABLE half sub_group_scan_exclusive_min(half x);
OVERLOADABLE float sub_group_scan_exclusive_min(float x);
OVERLOADABLE double sub_group_scan_exclusive_min(double x);
+OVERLOADABLE short sub_group_scan_exclusive_min(short x);
+OVERLOADABLE ushort sub_group_scan_exclusive_min(ushort x);
+OVERLOADABLE short intel_sub_group_scan_exclusive_min(short x);
+OVERLOADABLE ushort intel_sub_group_scan_exclusive_min(ushort x);
/* scan_exclusive max */
OVERLOADABLE int sub_group_scan_exclusive_max(int x);
@@ -139,21 +159,36 @@ OVERLOADABLE ulong sub_group_scan_exclusive_max(ulong x);
OVERLOADABLE half sub_group_scan_exclusive_max(half x);
OVERLOADABLE float sub_group_scan_exclusive_max(float x);
OVERLOADABLE double sub_group_scan_exclusive_max(double x);
+OVERLOADABLE short sub_group_scan_exclusive_max(short x);
+OVERLOADABLE ushort sub_group_scan_exclusive_max(ushort x);
+OVERLOADABLE short intel_sub_group_scan_exclusive_max(short x);
+OVERLOADABLE ushort intel_sub_group_scan_exclusive_max(ushort x);
/* shuffle */
OVERLOADABLE half intel_sub_group_shuffle(half x, uint c);
OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE short intel_sub_group_shuffle(short x, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle(ushort x, uint c);
+
OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_down(short x, short y, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_down(ushort x, ushort y, uint c);
+
OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_up(short x, short y, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_up(ushort x, ushort y, uint c);
+
OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_xor(short x, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_xor(ushort x, uint c);
/* block read/write */
OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
@@ -161,10 +196,10 @@ OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
-OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
-OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
-OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
-OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(__global uint* p, uint8 data);
OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord);
OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord);
@@ -175,3 +210,43 @@ OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord,
OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data);
OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data);
OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p);
+
+OVERLOADABLE void intel_sub_group_block_write_ui(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(__global uint* p, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t image, int2 byte_coord, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coord, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data);
+
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p);
+
+OVERLOADABLE void intel_sub_group_block_write_us(__global ushort* p, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(__global ushort* p, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(__global ushort* p, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(__global ushort* p, ushort8 data);
+
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coord, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data);
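For reference, a minimal OpenCL C sketch using the new short/ushort overloads declared above (assuming the cl_intel_subgroups and cl_intel_subgroups_short extensions; the single-block buffer layout is illustrative):

    #pragma OPENCL EXTENSION cl_intel_subgroups : enable
    #pragma OPENCL EXTENSION cl_intel_subgroups_short : enable

    /* Each lane loads one ushort from the block, swaps values with its
       neighbouring lane, and writes the block back. */
    kernel void swap_pairs(global ushort *buf)
    {
        ushort v = intel_sub_group_block_read_us(buf);
        ushort swapped = intel_sub_group_shuffle(v, get_sub_group_local_id() ^ 1);
        intel_sub_group_block_write_us(buf, swapped);
    }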
diff --git a/backend/src/llvm/ExpandLargeIntegers.cpp b/backend/src/llvm/ExpandLargeIntegers.cpp
index 1ee294f..60740f5 100644
--- a/backend/src/llvm/ExpandLargeIntegers.cpp
+++ b/backend/src/llvm/ExpandLargeIntegers.cpp
@@ -324,12 +324,14 @@ static Value *buildVectorOrScalar(ConversionState &State, IRBuilder<> &IRB, Smal
Value * vec = NULL;
unsigned ElemNo = Elements.size();
Type *ElemTy = Elements[0]->getType();
+ // If it is an illegal integer type, these instructions will be further
+ // split; that's why these temporary values should be erased.
bool KeepInsert = isLegalBitSize(ElemTy->getPrimitiveSizeInBits() * ElemNo);
for (unsigned i = 0; i < ElemNo; ++i) {
Value *tmp = vec ? vec : UndefValue::get(VectorType::get(ElemTy, ElemNo));
Value *idx = ConstantInt::get(IntTy, i);
vec = IRB.CreateInsertElement(tmp, Elements[i], idx);
- if (!KeepInsert) {
+ if (!KeepInsert && !isa<Constant>(vec)) {
State.addEraseCandidate(cast<Instruction>(vec));
}
}
@@ -436,6 +438,7 @@ static void convertInstruction(Instruction *Inst, ConversionState &State,
State.recordConverted(Trunc, NewInst);
} else {
TypePair Tys = getExpandedIntTypes(Trunc->getType());
+ (void) OpTys;
assert(Tys.Lo == OpTys.Lo);
Value *Lo = Ops.Lo;
Value *Hi = IRB.CreateTrunc(Ops.Hi, Tys.Hi, Twine(Name, ".hi"));
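The added "(void) OpTys;" keeps release builds, where assert() compiles away, from warning about a set-but-unused variable. The same C idiom in isolation:

    #include <assert.h>

    /* 'expected' is consumed only by the assert; the void cast marks it
       as deliberately used when NDEBUG removes the assert. */
    static int checked_lo(int lo, int expected)
    {
        (void) expected;
        assert(lo == expected);
        return lo;
    }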
diff --git a/backend/src/llvm/PromoteIntegers.cpp b/backend/src/llvm/PromoteIntegers.cpp
index adba004..a500311 100644
--- a/backend/src/llvm/PromoteIntegers.cpp
+++ b/backend/src/llvm/PromoteIntegers.cpp
@@ -151,6 +151,7 @@ static Value *convertConstant(Constant *C, bool SignExt=false) {
} else {
errs() << "Value: " << *C << "\n";
report_fatal_error("Unexpected constant value");
+ return NULL;
}
}
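The added "return NULL;" placates compilers that cannot see that report_fatal_error never returns and would otherwise warn that control reaches the end of a value-returning function. A standalone C sketch of the pattern:

    #include <stdio.h>
    #include <stdlib.h>

    /* fatal() never returns, but without a noreturn annotation the
       compiler cannot prove it, so the caller adds an unreachable return. */
    static void fatal(const char *msg) { fprintf(stderr, "%s\n", msg); exit(1); }

    static int convert(int kind)
    {
        if (kind == 0)
            return 42;
        fatal("Unexpected constant value");
        return 0; /* unreachable; silences -Wreturn-type */
    }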
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
index 3bf3853..9d07c29 100644
--- a/backend/src/llvm/StripAttributes.cpp
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -89,10 +89,12 @@ namespace {
char StripAttributes::ID = 0;
bool StripAttributes::runOnFunction(Function &Func) {
- if (!gbe::isKernelFunction(Func))
- Func.addFnAttr(Attribute::AlwaysInline);
Func.setCallingConv(CallingConv::C);
Func.setLinkage(GlobalValue::ExternalLinkage);
+ if (!gbe::isKernelFunction(Func)) {
+ Func.addFnAttr(Attribute::AlwaysInline);
+ Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
+ }
for (Function::iterator BB = Func.begin(), E = Func.end();
BB != E; ++BB) {
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index a3f9886..89d5e7c 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -26,18 +26,23 @@
#include "src/GBEConfig.h"
#include "llvm_includes.hpp"
#include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
using namespace llvm;
SVAR(OCL_BITCODE_LIB_PATH, OCL_BITCODE_BIN);
+SVAR(OCL_BITCODE_LIB_20_PATH, OCL_BITCODE_BIN_20);
namespace gbe
{
- static Module* createOclBitCodeModule(LLVMContext& ctx, bool strictMath)
+ static Module* createOclBitCodeModule(LLVMContext& ctx,
+ bool strictMath,
+ uint32_t oclVersion)
{
- std::string bitCodeFiles = OCL_BITCODE_LIB_PATH;
+ std::string bitCodeFiles = oclVersion >= 200 ?
+ OCL_BITCODE_LIB_20_PATH : OCL_BITCODE_LIB_PATH;
if(bitCodeFiles == "")
- bitCodeFiles = OCL_BITCODE_BIN;
+ bitCodeFiles = oclVersion >= 200 ? OCL_BITCODE_BIN_20 : OCL_BITCODE_BIN;
std::istringstream bitCodeFilePath(bitCodeFiles);
std::string FilePath;
bool findBC = false;
@@ -86,11 +91,11 @@ namespace gbe
}
llvm::Function * callFunc = call->getCalledFunction();
- if(!callFunc) {
- continue;
- }
+ //if(!callFunc) {
+ // continue;
+ //}
- if (callFunc->getIntrinsicID() != 0)
+ if (callFunc && callFunc->getIntrinsicID() != 0)
continue;
std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
@@ -135,12 +140,16 @@ namespace gbe
}
- Module* runBitCodeLinker(Module *mod, bool strictMath)
+ Module* runBitCodeLinker(Module *mod, bool strictMath, ir::Unit &unit)
{
LLVMContext& ctx = mod->getContext();
std::set<std::string> materializedFuncs;
std::vector<GlobalValue *> Gvs;
- Module* clonedLib = createOclBitCodeModule(ctx, strictMath);
+
+ uint32_t oclVersion = getModuleOclVersion(mod);
+ ir::PointerSize size = oclVersion >= 200 ? ir::POINTER_64_BITS : ir::POINTER_32_BITS;
+ unit.setPointerSize(size);
+ Module* clonedLib = createOclBitCodeModule(ctx, strictMath, oclVersion);
if (clonedLib == NULL)
return NULL;
@@ -182,6 +191,28 @@ namespace gbe
builtinFuncs.push_back("__gen_memcpy_gc_align");
builtinFuncs.push_back("__gen_memcpy_lc_align");
+ if (oclVersion >= 200) {
+ builtinFuncs.push_back("__gen_memcpy_gn");
+ builtinFuncs.push_back("__gen_memcpy_pn");
+ builtinFuncs.push_back("__gen_memcpy_ln");
+ builtinFuncs.push_back("__gen_memcpy_ng");
+ builtinFuncs.push_back("__gen_memcpy_np");
+ builtinFuncs.push_back("__gen_memcpy_nl");
+ builtinFuncs.push_back("__gen_memcpy_nc");
+ builtinFuncs.push_back("__gen_memcpy_nn");
+ builtinFuncs.push_back("__gen_memset_n");
+
+ builtinFuncs.push_back("__gen_memcpy_gn_align");
+ builtinFuncs.push_back("__gen_memcpy_pn_align");
+ builtinFuncs.push_back("__gen_memcpy_ln_align");
+ builtinFuncs.push_back("__gen_memcpy_ng_align");
+ builtinFuncs.push_back("__gen_memcpy_np_align");
+ builtinFuncs.push_back("__gen_memcpy_nl_align");
+ builtinFuncs.push_back("__gen_memcpy_nc_align");
+ builtinFuncs.push_back("__gen_memcpy_nn_align");
+ builtinFuncs.push_back("__gen_memset_n_align");
+ }
+
for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
if (SF->isDeclaration()) continue;
if (!isKernelFunction(*SF)) continue;
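In effect, modules compiled as OpenCL C 2.0 (version code >= 200) now link against a separately built 2.0 bitcode library. A simplified C model of the selection logic above (the library paths shown are hypothetical; the real ones come from OCL_BITCODE_LIB_PATH / OCL_BITCODE_LIB_20_PATH):

    static const char *pick_bitcode_lib(unsigned ocl_version)
    {
        /* >= 200 means OpenCL C 2.0: use the 2.0 library build. */
        return ocl_version >= 200 ? "/usr/lib/beignet/beignet_20.bc"
                                  : "/usr/lib/beignet/beignet.bc";
    }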
diff --git a/backend/src/llvm/llvm_device_enqueue.cpp b/backend/src/llvm/llvm_device_enqueue.cpp
new file mode 100644
index 0000000..ee236de
--- /dev/null
+++ b/backend/src/llvm/llvm_device_enqueue.cpp
@@ -0,0 +1,417 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <list>
+#include "llvm_includes.hpp"
+
+#include "ir/unit.hpp"
+#include "llvm_gen_backend.hpp"
+#include "ocl_common_defines.h"
+
+using namespace llvm;
+
+namespace gbe {
+ BitCastInst *isInvokeBitcast(Instruction *I) {
+ BitCastInst* bt = dyn_cast<BitCastInst>(I);
+ if (bt == NULL)
+ return NULL;
+
+ Type* type = bt->getOperand(0)->getType();
+ if(!type->isPointerTy())
+ return NULL;
+
+ PointerType *pointerType = dyn_cast<PointerType>(type);
+ Type *pointed = pointerType->getElementType();
+ if(!pointed->isFunctionTy())
+ return NULL;
+
+ Function *Fn = dyn_cast<Function>(bt->getOperand(0));
+ if(Fn == NULL)
+ return NULL;
+
+ /* This is a heuristic check of whether the function bitcast is a block invoke or not */
+ std::string fnName = Fn->getName();
+ if(fnName.find("_invoke") == std::string::npos)
+ return NULL;
+
+ return bt;
+ }
+
+ void mutateArgAddressSpace(Argument *arg)
+ {
+ std::list<Value *>WorkList;
+ WorkList.push_back(arg);
+
+ while(!WorkList.empty()) {
+ Value *v = WorkList.front();
+
+ for (Value::use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
+ // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
+ // which is more straightforward.
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ User *theUser = *iter;
+#else
+ User *theUser = iter->getUser();
+#endif
+ // be careful with the sub operation
+ if (isa<StoreInst>(theUser) || isa<LoadInst>(theUser))
+ continue;
+
+ WorkList.push_back(theUser);
+ }
+
+ PointerType *ty = dyn_cast<PointerType>(v->getType());
+ if(ty == NULL) continue; //should be only one argument, of private pointer type
+ ty = PointerType::get(ty->getPointerElementType(), 1);
+ v->mutateType(ty);
+ WorkList.pop_front();
+ }
+ }
+
+ Function* setFunctionAsKernel(Module *mod, Function *Fn)
+ {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 9)
+ LLVMContext &Context = mod->getContext();
+ Type *intTy = IntegerType::get(mod->getContext(), 32);
+ SmallVector<llvm::Metadata *, 5> kernelMDArgs;
+
+ // MDNode for the kernel argument address space qualifiers.
+ SmallVector<llvm::Metadata *, 8> addressQuals;
+
+ // MDNode for the kernel argument access qualifiers (images only).
+ SmallVector<llvm::Metadata *, 8> accessQuals;
+
+ // MDNode for the kernel argument type names.
+ SmallVector<llvm::Metadata *, 8> argTypeNames;
+
+ // MDNode for the kernel argument base type names.
+ SmallVector<llvm::Metadata *, 8> argBaseTypeNames;
+
+ // MDNode for the kernel argument type qualifiers.
+ SmallVector<llvm::Metadata *, 8> argTypeQuals;
+
+ // MDNode for the kernel argument names.
+ SmallVector<llvm::Metadata *, 8> argNames;
+
+ //Because the parameter types changed, we must re-create the invoke function and replace the old one
+ std::vector<Type *> ParamTys;
+ ValueToValueMapTy VMap;
+ for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; ++I) {
+ PointerType *ty = dyn_cast<PointerType>(I->getType());
+ if(ty && ty->getAddressSpace() == 0) //Force the address space to global
+ ty = PointerType::get(ty->getPointerElementType(), 1);
+ ParamTys.push_back(ty);
+ }
+ FunctionType* NewFT = FunctionType::get(Fn->getReturnType(), ParamTys, false);
+ Function* NewFn = Function::Create(NewFT, Function::ExternalLinkage, Fn->getName());
+ SmallVector<ReturnInst*, 8> Returns;
+
+ Function::arg_iterator NewFnArgIt = NewFn->arg_begin();
+ for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; ++I) {
+ std::string ArgName = I->getName();
+ NewFnArgIt->setName(ArgName);
+ VMap[&*I] = &(*NewFnArgIt++);
+ }
+ CloneFunctionInto(NewFn, Fn, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ Fn->setName("__d" + Fn->getName());
+ mod->getFunctionList().push_back(NewFn);
+ //mod->getOrInsertFunction(NewFn->getName(), NewFn->getFunctionType(),
+ // NewFn->getAttributes());
+
+ for (Function::arg_iterator I = NewFn->arg_begin(), E = NewFn->arg_end(); I != E; ++I) {
+ PointerType *ty = dyn_cast<PointerType>(I->getType());
+ //mutate the address space of all pointers derived from the argument from private to global
+ if(ty && ty->getAddressSpace() == 1)
+ mutateArgAddressSpace(&*I);
+ //ty = dyn_cast<PointerType>(I->getType());
+
+ addressQuals.push_back(llvm::ConstantAsMetadata::get(ConstantInt::get(intTy, ty->getAddressSpace())));
+ accessQuals.push_back(llvm::MDString::get(Context, "none"));
+ argTypeNames.push_back(llvm::MDString::get(Context, "char*"));
+ argBaseTypeNames.push_back(llvm::MDString::get(Context, "char*"));
+ argTypeQuals.push_back(llvm::MDString::get(Context, ""));
+ argNames.push_back(llvm::MDString::get(Context, I->getName()));
+ }
+
+ //If we reach here the LLVM version is always >= 3.9; the version check exists only to keep older builds compiling.
+ NewFn->setMetadata("kernel_arg_addr_space",
+ llvm::MDNode::get(Context, addressQuals));
+ NewFn->setMetadata("kernel_arg_access_qual",
+ llvm::MDNode::get(Context, accessQuals));
+ NewFn->setMetadata("kernel_arg_type",
+ llvm::MDNode::get(Context, argTypeNames));
+ NewFn->setMetadata("kernel_arg_base_type",
+ llvm::MDNode::get(Context, argBaseTypeNames));
+ NewFn->setMetadata("kernel_arg_type_qual",
+ llvm::MDNode::get(Context, argTypeQuals));
+ NewFn->setMetadata("kernel_arg_name",
+ llvm::MDNode::get(Context, argNames));
+ return NewFn;
+#else
+ assert(0); //only OpenCL 2.0 can reach here.
+ return Fn;
+#endif
+ }
+
+ Instruction* replaceInst(Instruction *I, Value *v)
+ {
+ //The bitcast is an instruction
+ if(BitCastInst *bt = dyn_cast<BitCastInst>(&*I)) {
+ bt->replaceAllUsesWith(v);
+ return bt;
+ }
+ return NULL;
+ }
+
+ void collectDeviceEnqueueInfo(Module *mod, ir::Unit &unit)
+ {
+ std::set<Instruction*> deadInsnSet;
+ std::set<Function*> deadFunctionSet;
+ std::map<Value*, std::string> blocks;
+ if (getModuleOclVersion(mod) < 200)
+ return;
+
+ for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
+ Function *f = &*SF;
+ if (f->isDeclaration()) continue;
+
+ for (inst_iterator I = inst_begin(f), E = inst_end(f); I != E; ++I) {
+ if (BitCastInst* bt = isInvokeBitcast(&*I)) {
+ /* Handle the block description: convert the instruction that stores the block
+ * invoke pointer so that it stores an index into the unit's block function table. */
+ Function *Fn = dyn_cast<Function>(bt->getOperand(0));
+
+ std::string fnName = Fn->getName();
+ int index = -1;
+ for(size_t i=0; i<unit.blockFuncs.size(); i++) {
+ if(unit.blockFuncs[i] == fnName) {
+ index = i;
+ break;
+ }
+ }
+ if(index == -1){
+ unit.blockFuncs.push_back(fnName);
+ index = unit.blockFuncs.size() - 1;
+ }
+
+ for (Value::use_iterator iter = bt->use_begin(); iter != bt->use_end(); ++iter) {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ User *theUser = *iter;
+#else
+ User *theUser = iter->getUser();
+#endif
+ if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
+ GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(st->getPointerOperand());
+ if(gep)
+ blocks[gep->getOperand(0)] = fnName;
+ }
+ }
+
+ if(StoreInst* st = dyn_cast<StoreInst>(&*I)) {
+ GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(st->getPointerOperand());
+ if(gep)
+ blocks[gep->getOperand(0)] = fnName;
+ }
+
+ Value *v = Constant::getIntegerValue(bt->getType(), APInt(unit.getPointerSize(), index));
+ bt->replaceAllUsesWith(v);
+ deadInsnSet.insert(bt);
+ }
+
+ if(CallInst *CI = dyn_cast<CallInst>(&*I)) {
+ IRBuilder<> builder(CI->getParent(), BasicBlock::iterator(CI));
+ if(CI->getCalledFunction() == NULL) {
+ //indirect call: walk the uses to find the definition of the called function
+ SmallVector<Value*, 16> args(CI->op_begin(), CI->op_end()-1);
+
+ Value *v = CI->getCalledValue();
+ BitCastInst* bt = dyn_cast<BitCastInst>(v);
+ if(bt == NULL)
+ continue;
+
+ LoadInst* ld = dyn_cast<LoadInst>(bt->getOperand(0));
+ if(ld == NULL)
+ continue;
+
+ GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(ld->getPointerOperand());
+ if(gep == NULL)
+ continue;
+
+ BitCastInst* fnPointer = dyn_cast<BitCastInst>(gep->getOperand(0));
+ if(fnPointer == NULL)
+ continue;
+
+ if(BitCastInst* bt = dyn_cast<BitCastInst>(fnPointer->getOperand(0))) {
+ std::string fnName = blocks[bt->getOperand(0)];
+ Function* f = mod->getFunction(fnName);
+ CallInst *newCI = builder.CreateCall(f, args);
+ CI->replaceAllUsesWith(newCI);
+ deadInsnSet.insert(CI);
+ continue;
+ }
+
+ //the function pointer comes from a global variable
+ if(GlobalVariable* gv = dyn_cast<GlobalVariable>(fnPointer->getOperand(0))) {
+ Constant *c = gv->getInitializer();
+ ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
+ BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+ Function* f = dyn_cast<Function>(bt->getOperand(0));
+ CallInst *newCI = builder.CreateCall(f, args);
+ CI->replaceAllUsesWith(newCI);
+ deadInsnSet.insert(CI);
+ continue;
+ }
+
+ ld = dyn_cast<LoadInst>(fnPointer->getOperand(0));
+ if(ld == NULL)
+ continue;
+
+ if(GlobalVariable *gv = dyn_cast<GlobalVariable>(ld->getPointerOperand())) {
+ ConstantExpr *expr = dyn_cast<ConstantExpr>(gv->getInitializer());
+ BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+ GlobalVariable *block_literal = dyn_cast<GlobalVariable>(bt->getOperand(0));
+ Constant *v = block_literal->getInitializer();
+ expr = dyn_cast<ConstantExpr>(v->getOperand(3));
+ bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+ Function* f = dyn_cast<Function>(bt->getOperand(0));
+ CallInst *newCI = builder.CreateCall(f, args);
+ CI->replaceAllUsesWith(newCI);
+ deadInsnSet.insert(CI);
+ continue;
+ }
+
+ if(AllocaInst *ai = dyn_cast<AllocaInst>(ld->getPointerOperand())) {
+ Value *v = NULL;
+ for (Value::use_iterator iter = ai->use_begin(); iter != ai->use_end(); ++iter) {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ User *theUser = *iter;
+#else
+ User *theUser = iter->getUser();
+#endif
+ if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
+ bt = dyn_cast<BitCastInst>(st->getValueOperand());
+ if(bt)
+ v = bt->getOperand(0);
+ }
+ }
+ if(blocks.find(v) == blocks.end()) {
+ if(GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ Constant *c = gv->getInitializer();
+ ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
+ BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+ Function* f = dyn_cast<Function>(bt->getOperand(0));
+ blocks[v] = f->getName();
+ }
+ }
+
+ std::string fnName = blocks[v];
+ Function* f = mod->getFunction(fnName);
+ CallInst *newCI = builder.CreateCall(f, args);
+ CI->replaceAllUsesWith(newCI);
+ deadInsnSet.insert(CI);
+ continue;
+ }
+ //can't find the function's definition
+ assert(0);
+ } else {
+ //handle enqueue_kernel function call
+ Function *fn = CI->getCalledFunction();
+ if (fn->getName().find("enqueue_kernel") == std::string::npos)
+ continue;
+
+ //block parameter's index, 3 or 6
+ int block_index = 3;
+ Type *type = CI->getArgOperand(block_index)->getType();
+ if(type->isIntegerTy())
+ block_index = 6;
+ Value *block = CI->getArgOperand(block_index);
+ while(isa<BitCastInst>(block))
+ block = dyn_cast<BitCastInst>(block)->getOperand(0);
+ LoadInst *ld = dyn_cast<LoadInst>(block);
+ Value *v = NULL;
+ if(ld) {
+ Value *block = ld->getPointerOperand();
+ for (Value::use_iterator iter = block->use_begin(); iter != block->use_end(); ++iter) {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ User *theUser = *iter;
+#else
+ User *theUser = iter->getUser();
+#endif
+ if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
+ BitCastInst *bt = dyn_cast<BitCastInst>(st->getValueOperand());
+ if(bt)
+ v = bt->getOperand(0);
+ }
+ }
+ if(blocks.find(v) == blocks.end()) {
+ if(GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ Constant *c = gv->getInitializer();
+ ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
+ BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+ Function* f = dyn_cast<Function>(bt->getOperand(0));
+ blocks[v] = f->getName();
+ }
+ }
+ } else if(isa<AllocaInst>(block)) {
+ v = block;
+ }
+ std::string fnName = blocks[v];
+ Function* f = mod->getFunction(fnName);
+ deadFunctionSet.insert(f);
+ f = setFunctionAsKernel(mod, f);
+
+ if( fn->isVarArg() ) {
+ //enqueue function with SLM arguments: convert to a __gen_enqueue_kernel_slm call and
+ //store the SLM size information at an alloca address.
+ int start = block_index + 1;
+ int count = CI->getNumArgOperands() - start;
+ Type *intTy = IntegerType::get(mod->getContext(), 32);
+
+ AllocaInst *AI = builder.CreateAlloca(intTy, ConstantInt::get(intTy, count));
+
+ for(uint32_t i = start; i < CI->getNumArgOperands(); i++) {
+ Value *ptr = builder.CreateGEP(AI, ConstantInt::get(intTy, i-start));
+ builder.CreateStore(CI->getArgOperand(i), ptr);
+ }
+ SmallVector<Value*, 16> args(CI->op_begin(), CI->op_begin() + 3);
+ args.push_back(CI->getArgOperand(block_index));
+ args.push_back(ConstantInt::get(intTy, count));
+ args.push_back(AI);
+
+ std::vector<Type *> ParamTys;
+ for (Value** I = args.begin(); I != args.end(); ++I)
+ ParamTys.push_back((*I)->getType());
+ CallInst* newCI = builder.CreateCall(cast<llvm::Function>(mod->getOrInsertFunction(
+ "__gen_enqueue_kernel_slm", FunctionType::get(intTy, ParamTys, false))), args);
+ CI->replaceAllUsesWith(newCI);
+ deadInsnSet.insert(CI);
+ }
+ }
+ }
+ }
+ }
+
+ for (auto it: deadInsnSet) {
+ it->eraseFromParent();
+ }
+
+ for (auto it: deadFunctionSet) {
+ it->eraseFromParent();
+ }
+ }
+};
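The pass above rewrites device-side enqueues such as the following OpenCL C 2.0 sketch, replacing the block's invoke-function pointer with an index into the unit's block-function table (the child work here is illustrative):

    kernel void parent(global int *out)
    {
        queue_t q = get_default_queue();
        ndrange_t nd = ndrange_1D(64);
        /* The block literal is what isInvokeBitcast() spots: clang stores a
           bitcast of the generated ..._invoke function into it. */
        enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, nd,
                       ^{ out[get_global_id(0)] = 1; });
    }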
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 0570766..664d2ff 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -254,6 +254,7 @@ namespace gbe
case 1: return ir::MEM_GLOBAL;
case 2: return ir::MEM_CONSTANT;
case 3: return ir::MEM_LOCAL;
+ case 4: return ir::MEM_GENERIC;
}
GBE_ASSERT(false);
return ir::MEM_GLOBAL;
@@ -280,6 +281,38 @@ namespace gbe
return CPV;
}
+#define TYPESIZE(TYPE,VECT,SZ) else if( name == std::string(#TYPE).append(" __attribute__((ext_vector_type("#VECT")))") ) return VECT*SZ;
+#define TYPESIZEVEC(TYPE,SZ)\
+ else if(name == #TYPE) return SZ;\
+ TYPESIZE(TYPE,2,SZ)\
+ TYPESIZE(TYPE,3,SZ)\
+ TYPESIZE(TYPE,4,SZ)\
+ TYPESIZE(TYPE,8,SZ)\
+ TYPESIZE(TYPE,16,SZ)
+
+ static uint32_t getTypeSize(Module* M, const ir::Unit &unit, std::string& name) {
+ if(name == "size_t") return sizeof(size_t);
+ TYPESIZEVEC(char,1)
+ TYPESIZEVEC(unsigned char,1)
+ TYPESIZEVEC(short,2)
+ TYPESIZEVEC(unsigned short,2)
+ TYPESIZEVEC(half,2)
+ TYPESIZEVEC(int,4)
+ TYPESIZEVEC(unsigned int,4)
+ TYPESIZEVEC(float,4)
+ TYPESIZEVEC(double,8)
+ TYPESIZEVEC(long,8)
+ TYPESIZEVEC(unsigned long,8)
+ else{
+ StructType *StrTy = M->getTypeByName("struct."+name);
+ if(StrTy)
+ return getTypeByteSize(unit,StrTy);
+ }
+ GBE_ASSERTM(false, "Unsupported type name");
+ return 0;
+ }
+#undef TYPESIZEVEC
+#undef TYPESIZE
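TYPESIZEVEC matches the exact type-name strings clang records in the kernel argument metadata; one instantiation, hand-expanded as plain C for illustration:

    #include <string.h>

    /* Hand-expanded TYPESIZEVEC(int,4): vector variants carry clang's
       ext_vector_type spelling in the metadata string. */
    static unsigned type_size_int(const char *name)
    {
        if (strcmp(name, "int") == 0) return 4;
        if (strcmp(name, "int __attribute__((ext_vector_type(2)))") == 0)  return 2 * 4;
        if (strcmp(name, "int __attribute__((ext_vector_type(3)))") == 0)  return 3 * 4;
        if (strcmp(name, "int __attribute__((ext_vector_type(4)))") == 0)  return 4 * 4;
        if (strcmp(name, "int __attribute__((ext_vector_type(8)))") == 0)  return 8 * 4;
        if (strcmp(name, "int __attribute__((ext_vector_type(16)))") == 0) return 16 * 4;
        return 0; /* the real code falls back to a struct type lookup */
    }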
/*! Handle the LLVM IR Value to Gen IR register translation. This has 2 roles:
* - Split the LLVM vector into several scalar values
* - Handle the transparent copies (bitcast or use of intrinsic functions
@@ -553,7 +586,7 @@ namespace gbe
virtual bool doInitialization(Module &M);
/*! helper function for parsing global constant data */
- void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
+ void getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &) const;
void collectGlobalConstant(void) const;
ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
@@ -689,6 +722,8 @@ namespace gbe
DECL_VISIT_FN(BranchInst, BranchInst);
DECL_VISIT_FN(PHINode, PHINode);
DECL_VISIT_FN(AllocaInst, AllocaInst);
+ DECL_VISIT_FN(AtomicRMWInst, AtomicRMWInst);
+ DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst);
#undef DECL_VISIT_FN
// Emit unary instructions from gen native function
@@ -700,8 +735,8 @@ namespace gbe
// Emit subgroup instructions
void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
// Emit subgroup instructions
- void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
- void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
+ void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
+ void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
uint8_t appendSampler(CallSite::arg_iterator AI);
uint8_t getImageID(CallInst &I);
@@ -735,6 +770,7 @@ namespace gbe
return NULL;
return unit.printfs[inst];
}
+ void emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple);
private:
void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug information in the context for subsequent passing to Gen insn
ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -894,7 +930,8 @@ namespace gbe
pointerOrigMap.insert(std::make_pair(work, pointers));
} else {
// update the pointer source here,
- (*ptrIter).second[0] = ptr;
+ if ((!isa<SelectInst>(work) && !isa<PHINode>(work)))
+ (*ptrIter).second[0] = ptr;
}
continue;
@@ -940,7 +977,8 @@ namespace gbe
pointerOrigMap.insert(std::make_pair(pointer, pointers));
} else {
// update the pointer source here,
- (*ptrIter).second[0] = ptr;
+ if ((!isa<SelectInst>(pointer) && !isa<PHINode>(pointer)))
+ (*ptrIter).second[0] = ptr;
}
} else {
workList.push_back(theUser);
@@ -1188,8 +1226,13 @@ namespace gbe
break;
}
case 2:
- new_bti = BTI_CONSTANT;
-
+ // OCL 2.0: constant pointers use a separate bti
+ if(legacyMode)
+ new_bti = BTI_CONSTANT;
+ else {
+ new_bti = btiBase;
+ incBtiBase();
+ }
break;
case 3:
new_bti = BTI_LOCAL;
@@ -1230,9 +1273,11 @@ namespace gbe
}
MDNode *typeNameNode = NULL;
MDNode *typeBaseNameNode = NULL;
+ MDNode *typeQualNode = NULL;
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
typeNameNode = F.getMetadata("kernel_arg_type");
typeBaseNameNode = F.getMetadata("kernel_arg_base_type");
+ typeQualNode = F.getMetadata("kernel_arg_type_qual");
#else
MDNode *node = getKernelFunctionMetadata(&F);
for(uint j = 0;node && j < node->getNumOperands() - 1; j++) {
@@ -1242,6 +1287,8 @@ namespace gbe
if (!attrName) continue;
if (attrName->getString() == "kernel_arg_type") {
typeNameNode = attrNode;
+ } else if (attrName->getString() == "kernel_arg_type_qual") {
+ typeQualNode = attrNode;
}
if (attrName->getString() == "kernel_arg_base_type") {
typeBaseNameNode = attrNode;
@@ -1263,9 +1310,12 @@ namespace gbe
if(typeBaseNameNode) {
llvmInfo.typeBaseName= (cast<MDString>(typeBaseNameNode->getOperand(opID)))->getString();
}
+ llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(opID)))->getString();
+ llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(opID)))->getString();
bool isImage = llvmInfo.isImageType();
- if (I->getType()->isPointerTy() || isImage) {
- BtiMap.insert(std::make_pair(&*I, getNewBti(&*I, isImage)));
+ bool isPipe = llvmInfo.isPipeType();
+ if (I->getType()->isPointerTy() || isImage || isPipe) {
+ BtiMap.insert(std::make_pair(&*I, getNewBti(&*I, isImage || isPipe)));
}
}
@@ -1319,12 +1369,12 @@ namespace gbe
}
Builder.SetInsertPoint(cast<Instruction>(theUser));
- Type *int32Ty = Type::getInt32Ty(ptr->getContext());
- Value *v1 = Builder.CreatePtrToInt(pointerOp, int32Ty);
+ Type *ptyTy = IntegerType::get(ptr->getContext(), getTypeBitSize(unit, ptr->getType()));
+ Value *v1 = Builder.CreatePtrToInt(pointerOp, ptyTy);
- Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), int32Ty);
- Value *v3 = Builder.CreatePtrToInt(base, int32Ty);
- Value *v4 = Builder.CreatePtrToInt(bti, int32Ty);
+ Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), ptyTy);
+ Value *v3 = Builder.CreatePtrToInt(base, ptyTy);
+ Value *v4 = Builder.CreatePtrToInt(bti, ptyTy);
// newLocBase = (pointer - origin) + base_start
Value *diff = Builder.CreateSub(v1, v2);
Value *newLocBase = Builder.CreateAdd(v3, diff);
@@ -1390,8 +1440,8 @@ namespace gbe
}
}
// storing/loading pointer would introduce revisit
- for (std::vector<Value *>::iterator iter = revisit.begin(); iter != revisit.end(); ++iter) {
- findPointerEscape(*iter, mixedPtr, true, revisit);
+ for (size_t i = 0; i < revisit.size(); ++i) {
+ findPointerEscape(revisit[i], mixedPtr, true, revisit);
}
// the second pass starts from mixed pointer
@@ -1445,22 +1495,61 @@ namespace gbe
return;
}
- void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+ void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &relocs) const {
Type * type = c->getType();
Type::TypeID id = type->getTypeID();
GBE_ASSERT(c);
+ if (isa<ConstantExpr>(c)) {
+ const ConstantExpr *expr = dyn_cast<ConstantExpr>(c);
+ Value *pointer = expr->getOperand(0);
+ if (expr->getOpcode() == Instruction::GetElementPtr) {
+ uint32_t constantOffset = 0;
+ CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+ for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
+ int32_t TypeIndex;
+ ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
+ GBE_ASSERTM(ConstOP != NULL, "must be constant index");
+ TypeIndex = ConstOP->getZExtValue();
+ GBE_ASSERT(TypeIndex >= 0);
+ constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
+ CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+ }
+
+ ir::Constant cc = unit.getConstantSet().getConstant(pointer->getName());
+ unsigned int defOffset = cc.getOffset();
+ relocs.push_back(ir::RelocEntry(offset, defOffset + constantOffset));
+
+ uint32_t size = getTypeByteSize(unit, type);
+ memset((char*)mem+offset, 0, size);
+ offset += size;
+ } else if (expr->isCast()) {
+ Constant *constPtr = cast<Constant>(pointer);
+ getConstantData(constPtr, mem, offset, relocs);
+ offset += getTypeByteSize(unit, type);
+ }
+ return;
+ }
+ if (isa<GlobalVariable>(c)) {
+ ir::Constant cc = unit.getConstantSet().getConstant(c->getName());
+ unsigned int defOffset = cc.getOffset();
+
+ relocs.push_back(ir::RelocEntry(offset, defOffset));
+ uint32_t size = getTypeByteSize(unit, type);
+ memset((char*)mem+offset, 0, size);
+ offset += size;
+ return;
+ }
if(isa<UndefValue>(c)) {
uint32_t size = getTypeByteSize(unit, type);
offset += size;
return;
- } else if(isa<ConstantAggregateZero>(c)) {
+ } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) {
uint32_t size = getTypeByteSize(unit, type);
memset((char*)mem+offset, 0, size);
offset += size;
return;
}
-
switch(id) {
case Type::TypeID::StructTyID:
{
@@ -1478,7 +1567,7 @@ namespace gbe
offset += padding/8;
const Constant* sub = cast<Constant>(c->getOperand(op));
GBE_ASSERT(sub);
- getConstantData(sub, mem, offset);
+ getConstantData(sub, mem, offset, relocs);
}
break;
}
@@ -1499,7 +1588,7 @@ namespace gbe
uint32_t ops = c->getNumOperands();
for(uint32_t op = 0; op < ops; ++op) {
Constant * ca = dyn_cast<Constant>(c->getOperand(op));
- getConstantData(ca, mem, offset);
+ getConstantData(ca, mem, offset, relocs);
offset += padding;
}
}
@@ -1538,30 +1627,75 @@ namespace gbe
offset += sizeof(double);
break;
}
+ case Type::TypeID::HalfTyID:
+ {
+ const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+ llvm::APFloat apf = cf->getValueAPF();
+ llvm::APInt api = apf.bitcastToAPInt();
+ uint64_t v64 = api.getZExtValue();
+ uint16_t v16 = static_cast<uint16_t>(v64);
+ *(unsigned short *)((char*)mem+offset) = v16;
+ offset += sizeof(short);
+ break;
+ }
+ case Type::TypeID::PointerTyID:
+ {
+ break;
+ }
default:
- NOT_IMPLEMENTED;
+ {
+ c->dump();
+ NOT_IMPLEMENTED;
+ }
}
}
-
+ static bool isProgramGlobal(const GlobalVariable &v) {
+ unsigned addrSpace = v.getType()->getAddressSpace();
+ // constant (2), global (1) or private (0) address space
+ return (addrSpace == 2 || addrSpace == 1 || addrSpace == 0);
+ }
void GenWriter::collectGlobalConstant(void) const {
const Module::GlobalListType &globalList = TheModule->getGlobalList();
+ // The first pass just creates the global variable constants
for(auto i = globalList.begin(); i != globalList.end(); i ++) {
const GlobalVariable &v = *i;
- if(!v.isConstantUsed()) continue;
const char *name = v.getName().data();
- ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
- if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
- GBE_ASSERT(v.hasInitializer());
- const Constant *c = v.getInitializer();
- Type * type = c->getType();
+ vector<ir::RelocEntry> relocs;
+
+ if(isProgramGlobal(v)) {
+ Type * type = v.getType()->getPointerElementType();
uint32_t size = getTypeByteSize(unit, type);
- void* mem = malloc(size);
- uint32_t offset = 0;
- getConstantData(c, mem, offset);
uint32_t alignment = getAlignmentByte(unit, type);
- unit.newConstant((char *)mem, name, size, alignment);
- free(mem);
+ unit.newConstant(name, size, alignment);
+ }
+ }
+ // The second pass initializes the data
+ for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+ const GlobalVariable &v = *i;
+ const char *name = v.getName().data();
+
+ if(isProgramGlobal(v)) {
+ if (v.hasInitializer()) {
+ vector<ir::RelocEntry> relocs;
+ uint32_t offset = 0;
+ ir::Constant &con = unit.getConstantSet().getConstant(name);
+ void* mem = malloc(con.getSize());
+ const Constant *c = v.getInitializer();
+ getConstantData(c, mem, offset, relocs);
+ unit.getConstantSet().setData((char*)mem, con.getOffset(), con.getSize());
+ free(mem);
+
+ if (!legacyMode) {
+ uint32_t refOffset = unit.getConstantSet().getConstant(name).getOffset();
+ for (uint32_t k = 0; k < relocs.size(); k++) {
+ unit.getRelocTable().addEntry(
+ refOffset + relocs[k].refOffset,
+ relocs[k].defOffset
+ );
+ }
+ }
+ }
}
}
}
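The two-pass scheme above (first place every global, then fill in the data) lets OpenCL C 2.0 program-scope pointer initializers reference other globals through relocation entries. A sketch of source that exercises it (names illustrative):

    /* Program-scope variables (OpenCL C 2.0). The pointer initializer
       cannot be resolved until 'seed' has been placed, hence the
       ir::RelocEntry recorded above. */
    global int seed = 7;
    global int *seed_ptr = &seed;

    kernel void read_seed(global int *out)
    {
        out[get_global_id(0)] = *seed_ptr;
    }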
@@ -1571,6 +1705,9 @@ namespace gbe
// Initialize
TheModule = &M;
+ uint32_t oclVersion = getModuleOclVersion(TheModule);
+ legacyMode = oclVersion >= 200 ? false : true;
+ unit.setOclVersion(oclVersion);
collectGlobalConstant();
return false;
}
@@ -1703,7 +1840,10 @@ namespace gbe
// NULL pointers
if(isa<ConstantPointerNull>(CPV)) {
- return ctx.newImmediate(uint32_t(0));
+ if (ctx.getPointerFamily() == ir::FAMILY_QWORD)
+ return ctx.newImmediate(uint64_t(0));
+ else
+ return ctx.newImmediate(uint32_t(0));
}
const Type::TypeID typeID = CPV->getType()->getTypeID();
@@ -2222,6 +2362,14 @@ namespace gbe
}
if(typeNameNode) {
llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(opID)))->getString();
+ //LLVM 3.9 includes the access qualifier in an image's type name, which does not match the OpenCL spec; erase it.
+ std::vector<std::string> filters = {"__read_only ", "__write_only "};
+ for (uint32_t i = 0; i < filters.size(); i++) {
+ size_t pos = llvmInfo.typeName.find(filters[i]);
+ if (pos != std::string::npos) {
+ llvmInfo.typeName = llvmInfo.typeName.erase(pos, filters[i].length());
+ }
+ }
}
if(typeBaseNameNode){
llvmInfo.typeBaseName = (cast<MDString>(typeBaseNameNode->getOperand(opID)))->getString();
@@ -2273,6 +2421,11 @@ namespace gbe
(void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
continue;
}
+ if(llvmInfo.isPipeType()) {
+ llvmInfo.typeSize = getTypeSize(F.getParent(),unit,llvmInfo.typeName);
+ ctx.input(argName, ir::FunctionArgument::PIPE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), BtiMap.find(&*I)->second);
+ continue;
+ }
if (type->isPointerTy() == false)
ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
@@ -2807,6 +2960,8 @@ namespace gbe
const Constant *c = v.getInitializer();
Type *ty = c->getType();
uint32_t oldSlm = f.getSLMSize();
+ // FIXME: temporarily reserve 4 bytes to avoid a 0 address
+ if (oldSlm == 0) oldSlm = 4;
uint32_t align = 8 * getAlignmentByte(unit, ty);
uint32_t padding = getPadding(oldSlm*8, align);
@@ -2814,32 +2969,24 @@ namespace gbe
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
- ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
- } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
- GBE_ASSERT(v.hasInitializer());
- this->newRegister(const_cast<GlobalVariable*>(&v));
- ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
- ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
- ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
- } else {
+ ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType())));
+ } else if(addrSpace == ir::MEM_CONSTANT
+ || addrSpace == ir::MEM_GLOBAL
+ || v.isConstant()) {
if(v.getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
ctx.getUnit().getProfilingInfo()->setBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
regTranslator.newScalarProxy(ir::ocl::profilingbptr, const_cast<GlobalVariable*>(&v));
- } else if(v.getName().str().substr(0, 4) == ".str") {
- /* When there are multi printf statements in multi kernel fucntions within the same
- translate unit, if they have the same sting parameter, such as
- kernel_func1 () {
- printf("Line is %d\n", line_num1);
- }
- kernel_func2 () {
- printf("Line is %d\n", line_num2);
- }
- The Clang will just generate one global string named .strXXX to represent "Line is %d\n"
- So when translating the kernel_func1, we can not unref that global var, so we will
- get here. Just ignore it to avoid assert. */
} else {
- GBE_ASSERT(0 && "Unsupported private memory access pattern");
+ this->newRegister(const_cast<GlobalVariable*>(&v));
+ ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+ ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
+ ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
+ if (!legacyMode) {
+ ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg);
+ }
}
+ } else if(addrSpace == ir::MEM_PRIVATE) {
+ this->newRegister(const_cast<GlobalVariable*>(&v));
}
}
}
@@ -3331,7 +3478,10 @@ namespace gbe
case Instruction::FPTrunc:
case Instruction::Trunc:
this->newRegister(&I);
- break;
+ break;
+ case Instruction::AddrSpaceCast:
+ regTranslator.newValueProxy(srcValue, dstValue);
+ break;
default: NOT_SUPPORTED;
}
}
@@ -3339,6 +3489,8 @@ namespace gbe
void GenWriter::emitCastInst(CastInst &I) {
switch (I.getOpcode())
{
+ case Instruction::AddrSpaceCast:
+ break;
case Instruction::PtrToInt:
case Instruction::IntToPtr:
{
@@ -3628,6 +3780,7 @@ namespace gbe
this->newRegister(&I);
break;
case Intrinsic::ctlz:
+ case Intrinsic::cttz:
case Intrinsic::bswap:
this->newRegister(&I);
break;
@@ -3679,6 +3832,12 @@ namespace gbe
regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
case GEN_OCL_GET_LOCAL_SIZE2:
regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
+ case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE0:
+ regTranslator.newScalarProxy(ir::ocl::enqlsize0, dst); break;
+ case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE1:
+ regTranslator.newScalarProxy(ir::ocl::enqlsize1, dst); break;
+ case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE2:
+ regTranslator.newScalarProxy(ir::ocl::enqlsize2, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE0:
regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE1:
@@ -3741,7 +3900,7 @@ namespace gbe
case GEN_OCL_FORCE_SIMD16:
case GEN_OCL_LBARRIER:
case GEN_OCL_GBARRIER:
- case GEN_OCL_LGBARRIER:
+ case GEN_OCL_BARRIER:
ctx.getFunction().setUseSLM(true);
break;
case GEN_OCL_WRITE_IMAGE_I:
@@ -3827,6 +3986,7 @@ namespace gbe
case GEN_OCL_SIMD_SIZE:
case GEN_OCL_READ_TM:
case GEN_OCL_REGION:
+ case GEN_OCL_IN_PRIVATE:
case GEN_OCL_SIMD_ID:
case GEN_OCL_SIMD_SHUFFLE:
case GEN_OCL_VME:
@@ -3853,16 +4013,47 @@ namespace gbe
case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
case GEN_OCL_LRP:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
+ case GEN_OCL_ENQUEUE_SET_NDRANGE_INFO:
+ case GEN_OCL_ENQUEUE_GET_NDRANGE_INFO:
this->newRegister(&I);
break;
+ case GEN_OCL_GET_PIPE:
+ {
+ Value *srcValue = I.getOperand(0);
+ if( BtiMap.find(dst) == BtiMap.end())
+ {
+ unsigned tranBti = BtiMap.find(srcValue)->second;
+ BtiMap.insert(std::make_pair(dst, tranBti));
+ }
+ regTranslator.newValueProxy(srcValue, dst);
+ break;
+ }
+ case GEN_OCL_MAKE_RID:
+ case GEN_OCL_GET_RID:
+ {
+ Value *srcValue = I.getOperand(0);
+ regTranslator.newValueProxy(srcValue, dst);
+ break;
+ }
+ case GEN_OCL_ENQUEUE_GET_ENQUEUE_INFO_ADDR:
+ regTranslator.newScalarProxy(ir::ocl::enqueuebufptr, dst);
+ break;
case GEN_OCL_PRINTF:
this->newRegister(&I); // fall through
case GEN_OCL_PUTS:
@@ -3877,14 +4068,22 @@ namespace gbe
case GEN_OCL_CALC_TIMESTAMP:
case GEN_OCL_STORE_PROFILING:
case GEN_OCL_DEBUGWAIT:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
break;
case GEN_OCL_NOT_FOUND:
default:
@@ -3904,6 +4103,107 @@ namespace gbe
ctx.ALU1(opcode, type, dst, src);
}
+ void GenWriter::regAllocateAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ this->newRegister(&I);
+ }
+
+ void GenWriter::emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple) {
+ ir::Register pointer = this->getRegister(llvmPtr);
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+ // Get the function arguments
+ ir::Register ptr;
+ ir::Register btiReg;
+ unsigned SurfaceIndex = 0xff;
+ ir::AddressMode AM;
+ if (legacyMode) {
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ if (isa<ConstantInt>(bti)) {
+ AM = ir::AM_StaticBti;
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ } else {
+ AM = ir::AM_DynamicBti;
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+ } else {
+ AM = ir::AM_Stateless;
+ ptr = pointer;
+ }
+
+ ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);
+ }
+
+ void GenWriter::emitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ // Get the function arguments
+ Value *llvmPtr = I.getPointerOperand();
+ ir::AtomicOps opcode = ir::ATOMIC_OP_CMPXCHG;
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
+ const ir::Register oldValue = this->getRegister(&I, 0);
+ const ir::Register compareRet = this->getRegister(&I, 1);
+ const ir::Register expected = this->getRegister(I.getCompareOperand());
+
+ payload.push_back(this->getRegister(I.getCompareOperand()));
+ payloadNum++;
+ payload.push_back(this->getRegister(I.getNewValOperand()));
+ payloadNum++;
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ this->emitAtomicInstHelper(opcode, type, oldValue, llvmPtr, payloadTuple);
+ ctx.EQ(type, compareRet, oldValue, expected);
+ }
+
+ void GenWriter::regAllocateAtomicRMWInst(AtomicRMWInst &I) {
+ this->newRegister(&I);
+ }
+
+ static INLINE ir::AtomicOps atomicOpsLLVMToGen(llvm::AtomicRMWInst::BinOp llvmOp) {
+ switch(llvmOp) {
+ case llvm::AtomicRMWInst::Xchg: return ir::ATOMIC_OP_XCHG;
+ case llvm::AtomicRMWInst::Add: return ir::ATOMIC_OP_ADD;
+ case llvm::AtomicRMWInst::Sub: return ir::ATOMIC_OP_SUB;
+ case llvm::AtomicRMWInst::And: return ir::ATOMIC_OP_AND;
+ case llvm::AtomicRMWInst::Or: return ir::ATOMIC_OP_OR;
+ case llvm::AtomicRMWInst::Xor: return ir::ATOMIC_OP_XOR;
+ case llvm::AtomicRMWInst::Max: return ir::ATOMIC_OP_IMAX;
+ case llvm::AtomicRMWInst::Min: return ir::ATOMIC_OP_IMIN;
+ case llvm::AtomicRMWInst::UMax: return ir::ATOMIC_OP_UMAX;
+ case llvm::AtomicRMWInst::UMin: return ir::ATOMIC_OP_UMIN;
+ case llvm::AtomicRMWInst::Nand:
+ case llvm::AtomicRMWInst::BAD_BINOP: break;
+ }
+ GBE_ASSERT(false);
+ return ir::ATOMIC_OP_INVALID;
+ }
+
+ void GenWriter::emitAtomicRMWInst(AtomicRMWInst &I) {
+ // Get the function arguments
+ llvm::AtomicRMWInst::BinOp llvmOpcode = I.getOperation();
+ Value *llvmPtr = I.getOperand(0);
+ ir::AtomicOps opcode = atomicOpsLLVMToGen(llvmOpcode);
+
+ const ir::Register dst = this->getRegister(&I);
+
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
+
+ payload.push_back(this->getRegister(I.getOperand(1)));
+ payloadNum++;
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple);
+ }
+
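These handlers cover the LLVM atomicrmw/cmpxchg instructions that clang emits for OpenCL C 2.0 atomics, e.g. (a minimal sketch):

    /* clang lowers atomic_fetch_add on a global atomic_int to an LLVM
       atomicrmw add, which emitAtomicRMWInst above translates. */
    kernel void histogram(global atomic_int *bins, global const int *keys)
    {
        atomic_fetch_add(&bins[keys[get_global_id(0)] & 15], 1);
    }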
void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
@@ -4047,6 +4347,7 @@ namespace gbe
ctx.SUBGROUP(opcode, getRegister(&I), srcTuple, 1, ir::TYPE_S32);
} else if (opcode == ir::WORKGROUP_OP_BROADCAST) {
int argNum = CS.arg_size();
+ GBE_ASSERT(argNum == 2);
std::vector<ir::Register> src(argNum);
for (int i = 0; i < argNum; i++) {
src[i] = this->getRegister(*(AI++));
@@ -4076,7 +4377,7 @@ namespace gbe
GBE_ASSERT(AI == AE);
}
- void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+ void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
@@ -4112,7 +4413,6 @@ namespace gbe
ptr = pointer;
}
- ir::Type type = ir::TYPE_U32;
GBE_ASSERT(AM != ir::AM_DynamicBti);
if(isWrite){
@@ -4133,7 +4433,7 @@ namespace gbe
GBE_ASSERT(AI == AE);
}
- void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+ void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
@@ -4149,7 +4449,7 @@ namespace gbe
srcTupleData.push_back(getRegister(*(AI), i));
AI++;
const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
- ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size);
+ ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type);
} else {
ir::Register src[2];
src[0] = getRegister(*(AI++));
@@ -4159,7 +4459,7 @@ namespace gbe
dstTupleData.push_back(getRegister(&I, i));
const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
- ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2);
+ ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type);
}
GBE_ASSERT(AI == AE);
@@ -4307,6 +4607,56 @@ namespace gbe
}
}
break;
+ case Intrinsic::cttz:
+ {
+ Type *llvmDstType = I.getType();
+ ir::Type dstType = getType(ctx, llvmDstType);
+ Type *llvmSrcType = I.getOperand(0)->getType();
+ ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
+
+ //llvm.cttz.i64 is lowered to two llvm.cttz.i32 calls in ocl_ctz.ll
+ GBE_ASSERT(srcType != ir::TYPE_U64);
+
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+
+ uint32_t imm_value = 0;
+ if(srcType == ir::TYPE_U16) {
+ imm_value = 0xFFFF0000;
+ }else if(srcType == ir::TYPE_U8) {
+ imm_value = 0xFFFFFF00;
+ }
+ if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
+ ir::ImmediateIndex imm;
+ ir::Type tmpType = ir::TYPE_S32;
+ ir::Type revType = ir::TYPE_U32;
+ imm = ctx.newIntegerImmediate(imm_value, revType);
+ const ir::RegisterFamily family = getFamily(revType);
+ const ir::Register immReg = ctx.reg(family);
+ ctx.LOADI(ir::TYPE_U32, immReg, imm);
+
+ ir::Register tmp0 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp1 = ctx.reg(getFamily(revType));
+ ir::Register tmp2 = ctx.reg(getFamily(revType));
+ ir::Register revTmp = ctx.reg(getFamily(revType));
+
+ ctx.CVT(tmpType, srcType, tmp0, src);
+ //Gen does not have a 'tzd' instruction, so bit-reverse first and use LZD
+ ctx.ADD(revType, tmp1, tmp0, immReg);
+ ctx.ALU1(ir::OP_BFREV, revType, revTmp, tmp1);
+ ctx.ALU1(ir::OP_LZD, ir::TYPE_U32, tmp2, revTmp);
+ ctx.CVT(dstType, tmpType, dst, tmp2);
+ }
+ else
+ {
+ GBE_ASSERT(srcType == ir::TYPE_U32);
+ ir::Type revType = ir::TYPE_U32;
+ ir::Register revTmp = ctx.reg(getFamily(revType));
+ ctx.ALU1(ir::OP_BFREV, revType, revTmp, src);
+ ctx.ALU1(ir::OP_LZD, ir::TYPE_U32, dst, revTmp);
+ }
+ }
+ break;
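A C model of the lowering above: cttz(x) equals lzd(bitreverse(x)), and for 8/16-bit sources the added bias (0xFFFFFF00 / 0xFFFF0000) caps the result at the type width, e.g. cttz(0) == 16 for ushort. The helper functions are illustrative:

    #include <stdint.h>

    static uint32_t bitreverse32(uint32_t v)  /* models OP_BFREV */
    {
        uint32_t r = 0;
        for (int i = 0; i < 32; i++) { r = (r << 1) | (v & 1); v >>= 1; }
        return r;
    }

    static uint32_t lzd32(uint32_t v)         /* models OP_LZD; lzd(0) == 32 */
    {
        uint32_t n = 0;
        while (n < 32 && !(v & 0x80000000u)) { n++; v <<= 1; }
        return n;
    }

    static uint32_t cttz16(uint16_t x)
    {
        /* 0xFFFF0000 sets the high bits so the reversed value has at most
           16 leading zeros: cttz16(8) == 3, cttz16(0) == 16. */
        return lzd32(bitreverse32((uint32_t)x + 0xFFFF0000u));
    }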
case Intrinsic::fma:
case Intrinsic::fmuladd:
{
@@ -4433,7 +4783,26 @@ namespace gbe
ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length,
msg_type, vme_search_path_lut_x.getIntegerValue(),
lut_sub_x.getIntegerValue());
-
+ break;
+ }
+ case GEN_OCL_IN_PRIVATE:
+ {
+ const ir::Register dst = this->getRegister(&I);
+ uint32_t stackSize = ctx.getFunction().getStackSize();
+ if (stackSize == 0) {
+ ir::ImmediateIndex imm = ctx.newImmediate((bool)0);
+ ctx.LOADI(ir::TYPE_BOOL, dst, imm);
+ } else {
+ ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL);
+ ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL);
+ const ir::Register src0 = this->getRegister(*AI);
+ ir::Register tmp = ctx.reg(ir::FAMILY_QWORD);
+
+ ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer);
+ ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize);
+ ctx.LT(ir::TYPE_U64, cmp1, src0, tmp);
+ ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1);
+ }
break;
}
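The range test above (stackbuffer <= p < stackbuffer + stacksize) presumably backs generic-to-private queries such as OpenCL C 2.0 to_private(); a kernel-side sketch:

    kernel void touch(global int *out)
    {
        int x = 0;
        int *gp = &x;                     /* generic pointer into the private stack */
        private int *pp = to_private(gp); /* non-zero only if gp is private */
        out[get_global_id(0)] = (pp != 0);
    }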
case GEN_OCL_REGION:
@@ -4456,7 +4825,31 @@ namespace gbe
case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
- case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+ case GEN_OCL_BARRIER:
+ {
+ Constant *CPV = dyn_cast<Constant>(*AI);
+ unsigned syncFlag = 0;
+ if (CPV) {
+ const ir::Immediate &x = processConstantImm(CPV);
+ unsigned barrierArg = x.getIntegerValue();
+ if (barrierArg & 0x1) {
+ syncFlag |= ir::syncLocalBarrier;
+ }
+ if (barrierArg & 0x2) {
+ syncFlag |= ir::syncGlobalBarrier;
+ }
+ if (barrierArg & 0x4) {
+ syncFlag |= ir::syncImageBarrier;
+ }
+ } else {
+ // FIXME we default to a global fence plus barrier;
+ // a runtime check is needed here.
+ syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier;
+ }
+
+ ctx.SYNC(syncFlag);
+ break;
+ }
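The bit tests above line up with the OpenCL flag values (CLK_LOCAL_MEM_FENCE == 0x1, CLK_GLOBAL_MEM_FENCE == 0x2, and, in 2.0, CLK_IMAGE_MEM_FENCE == 0x4), so a constant-argument barrier maps directly to sync flags:

    kernel void reverse_step(global float *data, local float *tmp)
    {
        tmp[get_local_id(0)] = data[get_global_id(0)];
        /* constant flags: the compiler emits syncLocalBarrier | syncGlobalBarrier */
        barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
        data[get_global_id(0)] = tmp[get_local_size(0) - 1 - get_local_id(0)];
    }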
case GEN_OCL_ATOMIC_ADD0:
case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
case GEN_OCL_ATOMIC_SUB0:
@@ -4529,6 +4922,7 @@ namespace gbe
bool isFloatCoord = coordType == ir::TYPE_FLOAT;
bool requiredFloatCoord = samplerOffset == 0;
+ (void) isFloatCoord;
GBE_ASSERT(isFloatCoord == requiredFloatCoord);
vector<ir::Register> dstTupleData, srcTupleData;
@@ -4904,6 +5298,7 @@ namespace gbe
Value *bti = getBtiRegister(llvmPtr);
GBE_ASSERT(isa<ConstantInt>(bti)); //Should never be mixed pointer.
uint32_t index = cast<ConstantInt>(bti)->getZExtValue();
+ (void) index;
GBE_ASSERT(btiToGen(index) == ir::MEM_GLOBAL);
++AI;
GBE_ASSERT(AI != AE);
@@ -4992,38 +5387,99 @@ namespace gbe
ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
break;
}
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM:
this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE:
this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, false, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, false, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, false, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, false, 8, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, true, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, true, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, true, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, true, 8, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE:
+ this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+ this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+ this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
+ this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+ this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+ this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+ this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
+ this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break;
+ case GEN_OCL_GET_PIPE:
+ case GEN_OCL_MAKE_RID:
+ case GEN_OCL_GET_RID:
+ {
+ break;
+ }
+ case GEN_OCL_ENQUEUE_SET_NDRANGE_INFO:
+ {
+ GBE_ASSERT(AI != AE);
+ Value *srcValue = *AI;
+ ++AI;
+ Value *dstValue = &I;
+ regTranslator.newValueProxy(srcValue, dstValue);
+ break;
+ }
+ case GEN_OCL_ENQUEUE_GET_NDRANGE_INFO:
+ {
+ GBE_ASSERT(AI != AE);
+ Value *srcValue = *AI;
+ ++AI;
+ Value *dstValue = &I;
+ regTranslator.newValueProxy(srcValue, dstValue);
+ break;
+ }
+ case GEN_OCL_ENQUEUE_GET_ENQUEUE_INFO_ADDR:
+ {
+ ctx.getFunction().setUseDeviceEnqueue(true);
+ break;
+ }
default: break;
}
}
@@ -5073,15 +5529,23 @@ namespace gbe
uint32_t prevStackPtr = ctx.getFunction().getStackSize();
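+ // Round the current stack size up to a multiple of 'align'; 'step' is the padding needed.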
uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
if (step != 0) {
- ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+ ir::ImmediateIndex stepImm;
+ ir::Type pointerTy = getType(pointerFamily);
+ if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+ stepImm = ctx.newImmediate(uint32_t(step));
+ else
+ stepImm = ctx.newImmediate(uint64_t(step));
ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
- ctx.LOADI(ir::TYPE_U32, stepReg, stepImm);
- ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+ ctx.LOADI(pointerTy, stepReg, stepImm);
+ ctx.ADD(pointerTy, stack, stack, stepReg);
ctx.getFunction().pushStackSize(step);
}
}
// Set the destination register properly
- ctx.MOV(imm.getType(), dst, stack);
+ if (legacyMode)
+ ctx.MOV(imm.getType(), dst, stack);
+ else
+ ctx.ADD(imm.getType(), dst, stack, ir::ocl::stackbuffer);
ctx.LOADI(imm.getType(), reg, immIndex);
ctx.ADD(imm.getType(), stack, stack, reg);
@@ -5249,7 +5713,7 @@ namespace gbe
// but later ArgumentLower pass need to match exact load/addImm pattern
// so, I avoid subtracting zero base to satisfy ArgumentLower pass.
if (!zeroBase)
- ctx.SUB(ir::TYPE_U32, mPtr, pointer, baseReg);
+ ctx.SUB(getType(ctx, llvmPtr->getType()), mPtr, pointer, baseReg);
else
mPtr = pointer;
} else {
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index f2a278e..1ab77c9 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -153,7 +153,12 @@ namespace gbe
llvm::FunctionPass* createSamplerFixPass();
/*! Add all the function call of ocl to our bitcode. */
- llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath);
+ llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath, ir::Unit &unit);
+
+ /*! Get the module's OpenCL version from metadata. */
+ uint32_t getModuleOclVersion(const llvm::Module *M);
+
+ void collectDeviceEnqueueInfo(llvm::Module *mod, ir::Unit &unit);
void* getPrintfInfo(llvm::CallInst* inst);
} /* namespace gbe */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 48a72d1..86485da 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -10,6 +10,9 @@ DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2, __gen_ocl_get_num_groups2)
DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE0, __gen_ocl_get_enqueued_local_size0)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE1, __gen_ocl_get_enqueued_local_size1)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE2, __gen_ocl_get_enqueued_local_size2)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0, __gen_ocl_get_global_size0)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1, __gen_ocl_get_global_size1)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
@@ -27,7 +30,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
// Barrier function
DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
-DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global)
+DECL_LLVM_GEN_FUNCTION(BARRIER, __gen_ocl_barrier)
// To force SIMD8/16 compilation
DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8)
@@ -169,6 +172,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
+DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private)
DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
@@ -217,22 +221,48 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
-
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM, __gen_ocl_sub_group_block_read_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM2, __gen_ocl_sub_group_block_read_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM4, __gen_ocl_sub_group_block_read_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM8, __gen_ocl_sub_group_block_read_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM, __gen_ocl_sub_group_block_write_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM2, __gen_ocl_sub_group_block_write_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM4, __gen_ocl_sub_group_block_write_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM8, __gen_ocl_sub_group_block_write_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE, __gen_ocl_sub_group_block_read_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE2, __gen_ocl_sub_group_block_read_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE4, __gen_ocl_sub_group_block_read_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE8, __gen_ocl_sub_group_block_read_ui_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE, __gen_ocl_sub_group_block_write_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE2, __gen_ocl_sub_group_block_write_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE4, __gen_ocl_sub_group_block_write_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE8, __gen_ocl_sub_group_block_write_ui_image8)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM, __gen_ocl_sub_group_block_read_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM2, __gen_ocl_sub_group_block_read_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM4, __gen_ocl_sub_group_block_read_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM8, __gen_ocl_sub_group_block_read_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM, __gen_ocl_sub_group_block_write_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM2, __gen_ocl_sub_group_block_write_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM4, __gen_ocl_sub_group_block_write_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM8, __gen_ocl_sub_group_block_write_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_read_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, __gen_ocl_sub_group_block_read_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8)
// common function
DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
+
+// pipe function
+DECL_LLVM_GEN_FUNCTION(GET_PIPE, __gen_ocl_get_pipe)
+DECL_LLVM_GEN_FUNCTION(GET_RID, __gen_ocl_get_rid)
+DECL_LLVM_GEN_FUNCTION(MAKE_RID, __gen_ocl_make_rid)
+
+//Enqueue function
+DECL_LLVM_GEN_FUNCTION(ENQUEUE_SET_NDRANGE_INFO, __gen_ocl_set_ndrange_info)
+DECL_LLVM_GEN_FUNCTION(ENQUEUE_GET_NDRANGE_INFO, __gen_ocl_get_ndrange_info)
+DECL_LLVM_GEN_FUNCTION(ENQUEUE_GET_ENQUEUE_INFO_ADDR, __gen_ocl_get_enqueue_info_addr)
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index c26e96a..f01bb51 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -54,6 +54,8 @@ namespace gbe {
return 'c';
case 3:
return 'l';
+ case 4:
+ return 'n';
default:
assert(0 && "Non support address space");
return '\0';
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 02dd4bf..367a2c3 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -65,6 +65,27 @@ namespace gbe
return bKernel;
}
+ uint32_t getModuleOclVersion(const llvm::Module *M) {
+ uint32_t oclVersion = 120;
+ NamedMDNode *version = M->getNamedMetadata("opencl.ocl.version");
+ if (version == NULL)
+ return oclVersion;
+ uint32_t ops = version->getNumOperands();
+ if(ops > 0) {
+ uint32_t major = 0, minor = 0;
+ MDNode* node = version->getOperand(0);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+ major = mdconst::extract<ConstantInt>(node->getOperand(0))->getZExtValue();
+ minor = mdconst::extract<ConstantInt>(node->getOperand(1))->getZExtValue();
+#else
+ major = cast<ConstantInt>(node->getOperand(0))->getZExtValue();
+ minor = cast<ConstantInt>(node->getOperand(1))->getZExtValue();
+#endif
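+ // Encode as major*100 + minor*10 (e.g. OpenCL 2.0 -> 200); without the metadata we default to 120 (OpenCL 1.2).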
+ oclVersion = major * 100 + minor * 10;
+ }
+ return oclVersion;
+ }
+
int32_t getPadding(int32_t offset, int32_t align) {
return (align - (offset % align)) % align;
}
@@ -262,7 +283,7 @@ namespace gbe
if(!operand)
continue;
-
+#if 0
//HACK TODO: Inserted by type replacement.. this code could break something????
if(getTypeByteSize(unit, operand->getType())>4)
{
@@ -286,7 +307,7 @@ namespace gbe
"", GEPInst);
}
}
-
+#endif
Value* tmpMul = operand;
if (size != 1) {
tmpMul = BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 615fb50..8850abb 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -682,10 +682,14 @@ namespace gbe {
*CI = InsertToVector(call, *CI);
break;
}
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
{
++CI;
++CI;
@@ -693,22 +697,32 @@ namespace gbe {
*CI = InsertToVector(call, *CI);
break;
}
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
{
if ((*CI)->getType()->isVectorTy())
*CI = InsertToVector(call, *CI);
break;
}
case GEN_OCL_VME:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
setAppendPoint(call);
extractFromVector(call);
break;
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index e108810..bef4df1 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -136,6 +136,9 @@ namespace gbe
MPM.add(createBasicAliasAnalysisPass());
#endif
MPM.add(createIntrinsicLoweringPass());
+ MPM.add(createBarrierNodupPass(false)); // remove noduplicate fnAttr before inlining.
+ MPM.add(createFunctionInliningPass(20000));
+ MPM.add(createBarrierNodupPass(true)); // restore noduplicate fnAttr after inlining.
MPM.add(createStripAttributesPass()); // Strip unsupported attributes and calling conventions.
MPM.add(createSamplerFixPass());
MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
@@ -146,9 +149,6 @@ namespace gbe
MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
MPM.add(createPruneEHPass()); // Remove dead EH info
- MPM.add(createBarrierNodupPass(false)); // remove noduplicate fnAttr before inlining.
- MPM.add(createFunctionInliningPass(20000));
- MPM.add(createBarrierNodupPass(true)); // restore noduplicate fnAttr after inlining.
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
MPM.add(createPostOrderFunctionAttrsLegacyPass());
#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
@@ -318,11 +318,23 @@ namespace gbe
if (!cl_mod) return false;
OUTPUT_BITCODE(BEFORE_LINK, (*cl_mod));
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+ legacy::PassManager passes__;
+#else
+ PassManager passes__;
+#endif
+ // Run ExpandConstantExprPass before collectDeviceEnqueueInfo
+ // to simplify the analysis of blocks.
+ passes__.add(createExpandConstantExprPass()); // constant prop may generate ConstantExpr
+ passes__.run(*cl_mod);
+ /* Must be called before materializing the module when linking */
+ collectDeviceEnqueueInfo(cl_mod, unit);
std::unique_ptr<Module> M;
- /* Before do any thing, we first filter in all CL functions in bitcode. */
- M.reset(runBitCodeLinker(cl_mod, strictMath));
+ /* Before doing anything, we first filter in all CL functions in the bitcode. */
+ /* Also set the unit's pointer size in runBitCodeLinker. */
+ M.reset(runBitCodeLinker(cl_mod, strictMath, unit));
if (!module)
delete cl_mod;
if (M.get() == 0)
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index 8a492d6..e24dc4f 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -103,13 +103,11 @@ namespace gbe {
}
void setUnrollID(Loop *L, bool enable) {
- if (!enable && disabledLoops.find(L) != disabledLoops.end())
- return;
+ assert(enable);
LLVMContext &Context = L->getHeader()->getContext();
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
SmallVector<Metadata *, 2> forceUnroll;
forceUnroll.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
- forceUnroll.push_back(ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), enable)));
MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
SmallVector<Metadata *, 4> Vals;
Vals.push_back(NULL);
@@ -127,8 +125,6 @@ namespace gbe {
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
L->setLoopID(NewLoopID);
- if (!enable)
- disabledLoops.insert(L);
}
static bool hasPrivateLoadStore(Loop *L) {
@@ -190,7 +186,8 @@ namespace gbe {
if (currTripCount > 32) {
shouldUnroll = false;
- setUnrollID(currL, false);
+ // Don't change the unroll ID when we are not forcing an unroll.
+ //setUnrollID(currL, false);
return shouldUnroll;
}
@@ -206,7 +203,8 @@ namespace gbe {
parentTripCount = SE->getSmallConstantTripCount(parentL, parentExitBlock);
}
if (parentTripCount != 0 && currTripCount * parentTripCount > 32) {
- setUnrollID(parentL, false);
+ // Don't change the unroll ID when we are not forcing an unroll.
+ //setUnrollID(parentL, false);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
loopInfo.markAsRemoved(parentL);
#else
@@ -243,8 +241,6 @@ namespace gbe {
virtual const char *getPassName() const {
return "SPIR backend: custom loop unrolling pass";
}
- private:
- std::set<Loop *> disabledLoops;
};
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index 42e6cc4..b9b5c6f 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -5,7 +5,7 @@
//
// Common defines for Image intrinsics
// Channel order
-#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB)
+#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB || color == CLK_sRGBA || color == CLK_sBGRA)
enum {
CLK_R = 0x10B0,
CLK_A = 0x10B1,
@@ -29,6 +29,11 @@ enum {
CLK_RGx = 0x10BB,
CLK_RGBx = 0x10BC
#endif
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_2_0)
+ ,
+ CLK_sRGBA = 0x10C1,
+ CLK_sBGRA = 0x10C2
+#endif
};
@@ -118,8 +123,4 @@ typedef enum clk_sampler_type {
} clk_sampler_type;
-// Memory synchronization
-#define CLK_LOCAL_MEM_FENCE (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-
#endif /* __OCL_COMMON_DEFINES__ */
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 64d33dc..5c62b4c 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -158,6 +158,18 @@ Supported Targets
* 6th Generation Intel Core Processors "Skylake" and "Kabylake".
* 5th Generation Intel Atom Processors "Broxton" or "Apollolake".
+OpenCL 2.0
+----------
+From release v1.3.0, Beignet supports OpenCL 2.0. By default, OpenCL 2.0 support is disabled; you can enable it at cmake time with the
+option -DENABLE_OPENCL_20=1. Please remember that enabling OpenCL 2.0 has some dependencies. First, OpenCL 2.0 is only supported on the
+targets from Skylake onwards, i.e. Skylake, Kabylake and Apollolake. Second, clang supports all OpenCL 2.0 features from version 3.9, so
+you must update LLVM/clang to 3.9 or later. It also requires libdrm 2.4.66 or later.
+With OpenCL 2.0 enabled, Beignet complies with the OpenCL 2.0 spec, but some OpenCL 2.0 features are emulated in software and bring no
+performance gain, such as pipes and device queues, especially device queues.
+If you build Beignet with OpenCL 2.0 enabled and your kernels don't use OpenCL 2.0 features, please pass the build option -cl-std=CL1.2
+for performance: the OpenCL 2.0 path uses more registers and many int64 operations, which may hurt performance. Beignet will continue to
+improve OpenCL 2.0 performance.
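+
+For example, a host program that wants the 1.2 compilation path can pass the option when building
+the program (a minimal sketch; `program` and `device` stand for a valid cl_program and
+cl_device_id of your application, and error handling is omitted):
+
+    cl_int err = clBuildProgram(program, 1, &device, "-cl-std=CL1.2", NULL, NULL);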
+
Known Issues
------------
@@ -273,6 +285,7 @@ Documents for OpenCL application developers
- [[V4l2 Buffer Sharing|Beignet/howto/v4l2-buffer-sharing-howto]]
- [[Video Motion Estimation|Beignet/howto/video-motion-estimation-howto]]
- [[Stand Alone Unit Test|Beignet/howto/stand-alone-utest-howto]]
+- [[Android build|Beignet/android-build-howto]]
The wiki URL is as below:
[http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 2ef0a89..465f38b 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
# News
+## Jan 20, 2017
+[Beignet 1.3.0](https://01.org/beignet/downloads/beignet-1.3.0-2017-01-20) is released. This is a major release. Please see the release notes for more information.
+
## Nov 4, 2016
[Beignet 1.2.1](https://01.org/beignet/downloads/beignet-1.2.1-2016-11-04) is released. This is a bug-fix release.
diff --git a/docs/howto/android-build-howto.mdwn b/docs/howto/android-build-howto.mdwn
new file mode 100644
index 0000000..be4d6a0
--- /dev/null
+++ b/docs/howto/android-build-howto.mdwn
@@ -0,0 +1,64 @@
+Android build HowTo
+===================
+
+Beignet supports Android, but among the several Android versions Beignet only
+provides build files for Android 5.1.1 (Lollipop). This document describes
+how to build and install the Beignet driver and utests on Android devices.
+Please note that Beignet disables PCH on Android, so kernel compilation is very
+slow; we recommend using a cross-compiler and OpenCL binary programs on Android.
+
+Prerequisite
+-------------
+You must have the Android source code and have built the Android system successfully.
+
+Build Beignet
+-------------
+
+Beignet builds just like other Android native libraries; the steps are as follows.
+
+- Add Beignet to the Android source code tree.
+  Put Beignet's source code in an appropriate directory under the Android source tree,
+  such as vendor/intel/ufo or external.
+
+- Set up the Android build environment and choose a target.
+ Run `source build/envsetup.sh` and `lunch`.
+
+- Build the clang target library.
+  Android only builds clang for the host, but Beignet depends on the target libclang.so. Beignet provides
+  a Lollipop patch for clang (version 3.5) to build the target libclang.so; you can download the patch
+  from [this patch](https://01.org/sites/default/files/enable-clang-device-build-for-beignet.patch).
+  Change directory to external/clang, run `git apply` to apply the patch, and run `mm` to build
+  libclang.so.
+
+- Change to Beignet's directory and build.
+  Run `mm -B`; if it fails the first time, run `mm -B` again. When it finishes, the Beignet libraries and
+  the utests binary will be generated under out/target/product/<target>/system/.
+
+Install Beignet to target device
+--------------------------------
+To install Beignet, you need to `adb push` all the Beignet and clang libraries to the corresponding
+directories on the device, including the following files:
+out/target/product/<target>/system/lib64/libclang.so to /system/lib64/
+out/target/product/<target>/system/lib64/libcl.so to /system/lib64/
+out/target/product/<target>/system/lib64/libgbeinterp.so to /system/lib64/
+out/target/product/<target>/system/lib64/libgbe.so to /system/lib64/
+out/target/product/<target>/system/lib/libclang.so to /system/lib/
+out/target/product/<target>/system/lib/libcl.so to /system/lib/
+out/target/product/<target>/system/lib/libgbeinterp.so to /system/lib/
+out/target/product/<target>/system/lib/libgbe.so to /system/lib/
+
+Additionally, the bitcode file and the OpenCL header files also need to be copied to the device, as follows:
+out/target/product/<target>/gen/SHARED_LIBRARIES/libgbe_intermediates/beignet.bc to /system/lib/ocl/
+out/target/product/<target>/gen/SHARED_LIBRARIES/libgbe_intermediates/libocl/include/ to /system/lib/ocl/include/
+
+If your application links against libOpenCL.so, you also need to create a soft link: `ln -s libcl.so libOpenCL.so`.
+
+If you want to run the utests on the device, you also need to copy the utests files:
+out/target/product/<target>/system/bin/utest_run-x86 to /system/bin/utest_run-x86
+out/target/product/<target>/system/bin/utest_run-x86_64 to /system/bin/utest_run-x86_64
+out/target/product/<target>/system/lib64/libutests.so to /system/lib64/
+out/target/product/<target>/system/lib/libutests.so to /system/lib/
+
+You also need to copy the utests' kernels to the device:
+<Beignet path>/kernels/ to /system/lib/ocl/
+and set the environment variable "OCL_KERNEL_PATH=/system/lib/ocl/kernels/" before running the utests.
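+
+For example (a minimal sketch, assuming the 64-bit binaries were installed as above), in an
+`adb shell` run `export OCL_KERNEL_PATH=/system/lib/ocl/kernels/` and then
+`/system/bin/utest_run-x86_64`.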
diff --git a/include/CL/cl.h b/include/CL/cl.h
index 316565d..116f5d5 100644
--- a/include/CL/cl.h
+++ b/include/CL/cl.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -55,16 +60,19 @@ typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
+typedef cl_bitfield cl_device_svm_capabilities;
typedef cl_bitfield cl_command_queue_properties;
typedef intptr_t cl_device_partition_property;
typedef cl_bitfield cl_device_affinity_domain;
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
+typedef cl_bitfield cl_queue_properties;
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
+typedef cl_bitfield cl_svm_mem_flags;
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
typedef cl_bitfield cl_mem_migration_flags;
@@ -74,6 +82,8 @@ typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
+typedef intptr_t cl_pipe_properties;
+typedef cl_uint cl_pipe_info;
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
typedef cl_uint cl_program_binary_type;
@@ -87,7 +97,8 @@ typedef cl_uint cl_kernel_work_group_info;
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
-
+typedef cl_bitfield cl_sampler_properties;
+typedef cl_uint cl_kernel_exec_info;
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
@@ -104,7 +115,13 @@ typedef struct _cl_image_desc {
size_t image_slice_pitch;
cl_uint num_mip_levels;
cl_uint num_samples;
- cl_mem buffer;
+#ifdef __GNUC__
+ __extension__ /* Prevents warnings about anonymous union in -pedantic builds */
+#endif
+ union {
+ cl_mem buffer;
+ cl_mem mem_object;
+ };
} cl_image_desc;
typedef struct _cl_buffer_region {
@@ -176,11 +193,14 @@ typedef struct _cl_buffer_region {
#define CL_INVALID_COMPILER_OPTIONS -66
#define CL_INVALID_LINKER_OPTIONS -67
#define CL_INVALID_DEVICE_PARTITION_COUNT -68
+#define CL_INVALID_PIPE_SIZE -69
+#define CL_INVALID_DEVICE_QUEUE -70
/* OpenCL Version */
#define CL_VERSION_1_0 1
#define CL_VERSION_1_1 1
#define CL_VERSION_1_2 1
+#define CL_VERSION_2_0 1
/* cl_bool */
#define CL_FALSE 0
@@ -204,82 +224,98 @@ typedef struct _cl_buffer_region {
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
-#define CL_DEVICE_TYPE 0x1000
-#define CL_DEVICE_VENDOR_ID 0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
-#define CL_DEVICE_ADDRESS_BITS 0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT 0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
-#define CL_DEVICE_MAX_SAMPLERS 0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
-#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
-#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
-#define CL_DEVICE_ENDIAN_LITTLE 0x1026
-#define CL_DEVICE_AVAILABLE 0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
-#define CL_DEVICE_NAME 0x102B
-#define CL_DEVICE_VENDOR 0x102C
-#define CL_DRIVER_VERSION 0x102D
-#define CL_DEVICE_PROFILE 0x102E
-#define CL_DEVICE_VERSION 0x102F
-#define CL_DEVICE_EXTENSIONS 0x1030
-#define CL_DEVICE_PLATFORM 0x1031
-#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+#define CL_DEVICE_TYPE 0x1000
+#define CL_DEVICE_VENDOR_ID 0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
+#define CL_DEVICE_ADDRESS_BITS 0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT 0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
+#define CL_DEVICE_MAX_SAMPLERS 0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
+#define CL_DEVICE_ENDIAN_LITTLE 0x1026
+#define CL_DEVICE_AVAILABLE 0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A
+#define CL_DEVICE_NAME 0x102B
+#define CL_DEVICE_VENDOR 0x102C
+#define CL_DRIVER_VERSION 0x102D
+#define CL_DEVICE_PROFILE 0x102E
+#define CL_DEVICE_VERSION 0x102F
+#define CL_DEVICE_EXTENSIONS 0x1030
+#define CL_DEVICE_PLATFORM 0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
-#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
-#define CL_DEVICE_OPENCL_C_VERSION 0x103D
-#define CL_DEVICE_LINKER_AVAILABLE 0x103E
-#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
-#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
-#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
-#define CL_DEVICE_PARENT_DEVICE 0x1042
-#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
-#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
-#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
-#define CL_DEVICE_PARTITION_TYPE 0x1046
-#define CL_DEVICE_REFERENCE_COUNT 0x1047
-#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
-#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
-#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
-#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
+#define CL_DEVICE_OPENCL_C_VERSION 0x103D
+#define CL_DEVICE_LINKER_AVAILABLE 0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
+#define CL_DEVICE_PARENT_DEVICE 0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
+#define CL_DEVICE_PARTITION_TYPE 0x1046
+#define CL_DEVICE_REFERENCE_COUNT 0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E
+#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052
+#define CL_DEVICE_SVM_CAPABILITIES 0x1053
+#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054
+#define CL_DEVICE_MAX_PIPE_ARGS 0x1055
+#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058
+#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059
+#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
@@ -307,6 +343,8 @@ typedef struct _cl_buffer_region {
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
+#define CL_QUEUE_ON_DEVICE (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3)
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
@@ -325,20 +363,27 @@ typedef struct _cl_buffer_region {
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
/* cl_device_affinity_domain */
-#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
-#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
-#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
-#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
-#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
-#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_device_svm_capabilities */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS (1 << 3)
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
+#define CL_QUEUE_SIZE 0x1094
-/* cl_mem_flags - bitfield */
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
@@ -349,6 +394,9 @@ typedef struct _cl_buffer_region {
#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
#define CL_MEM_HOST_READ_ONLY (1 << 8)
#define CL_MEM_HOST_NO_ACCESS (1 << 9)
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12)
/* cl_mem_migration_flags - bitfield */
#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
@@ -370,6 +418,11 @@ typedef struct _cl_buffer_region {
#define CL_RGBx 0x10BC
#define CL_DEPTH 0x10BD
#define CL_DEPTH_STENCIL 0x10BE
+#define CL_sRGB 0x10BF
+#define CL_sRGBx 0x10C0
+#define CL_sRGBA 0x10C1
+#define CL_sBGRA 0x10C2
+#define CL_ABGR 0x10C3
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
@@ -397,6 +450,7 @@ typedef struct _cl_buffer_region {
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+#define CL_MEM_OBJECT_PIPE 0x10F7
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
@@ -408,6 +462,7 @@ typedef struct _cl_buffer_region {
#define CL_MEM_CONTEXT 0x1106
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
+#define CL_MEM_USES_SVM_POINTER 0x1109
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
@@ -421,6 +476,10 @@ typedef struct _cl_buffer_region {
#define CL_IMAGE_BUFFER 0x1118
#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
#define CL_IMAGE_NUM_SAMPLES 0x111A
+
+/* cl_pipe_info */
+#define CL_PIPE_PACKET_SIZE 0x1120
+#define CL_PIPE_MAX_PACKETS 0x1121
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
@@ -439,6 +498,9 @@ typedef struct _cl_buffer_region {
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
+#define CL_SAMPLER_MIP_FILTER_MODE 0x1155
+#define CL_SAMPLER_LOD_MIN 0x1156
+#define CL_SAMPLER_LOD_MAX 0x1157
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
@@ -461,6 +523,7 @@ typedef struct _cl_buffer_region {
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
#define CL_PROGRAM_BINARY_TYPE 0x1184
+#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
/* cl_program_binary_type */
#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
@@ -506,6 +569,7 @@ typedef struct _cl_buffer_region {
#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
+#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3)
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
@@ -514,6 +578,10 @@ typedef struct _cl_buffer_region {
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
+
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
@@ -548,6 +616,11 @@ typedef struct _cl_buffer_region {
#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
#define CL_COMMAND_FILL_BUFFER 0x1207
#define CL_COMMAND_FILL_IMAGE 0x1208
+#define CL_COMMAND_SVM_FREE 0x1209
+#define CL_COMMAND_SVM_MEMCPY 0x120A
+#define CL_COMMAND_SVM_MEMFILL 0x120B
+#define CL_COMMAND_SVM_MAP 0x120C
+#define CL_COMMAND_SVM_UNMAP 0x120D
/* command execution status */
#define CL_COMPLETE 0x0
@@ -563,6 +636,7 @@ typedef struct _cl_buffer_region {
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
+#define CL_PROFILING_COMMAND_COMPLETE 0x1284
/********************************************************************************************************/
@@ -638,10 +712,10 @@ clGetContextInfo(cl_context /* context */,
/* Command Queue APIs */
extern CL_API_ENTRY cl_command_queue CL_API_CALL
-clCreateCommandQueue(cl_context /* context */,
- cl_device_id /* device */,
- cl_command_queue_properties /* properties */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateCommandQueueWithProperties(cl_context /* context */,
+ cl_device_id /* device */,
+ const cl_queue_properties * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
@@ -679,6 +753,14 @@ clCreateImage(cl_context /* context */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_uint /* pipe_packet_size */,
+ cl_uint /* pipe_max_packets */,
+ const cl_pipe_properties * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
@@ -706,19 +788,36 @@ clGetImageInfo(cl_mem /* image */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPipeInfo(cl_mem /* pipe */,
+ cl_pipe_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
+
extern CL_API_ENTRY cl_int CL_API_CALL
-clSetMemObjectDestructorCallback( cl_mem /* memobj */,
- void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
- void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+clSetMemObjectDestructorCallback(cl_mem /* memobj */,
+ void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+/* SVM Allocation APIs */
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc(cl_context /* context */,
+ cl_svm_mem_flags /* flags */,
+ size_t /* size */,
+ cl_uint /* alignment */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFree(cl_context /* context */,
+ void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
+
/* Sampler APIs */
extern CL_API_ENTRY cl_sampler CL_API_CALL
-clCreateSampler(cl_context /* context */,
- cl_bool /* normalized_coords */,
- cl_addressing_mode /* addressing_mode */,
- cl_filter_mode /* filter_mode */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateSamplerWithProperties(cl_context /* context */,
+ const cl_sampler_properties * /* normalized_coords */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
@@ -837,6 +936,17 @@ clSetKernelArg(cl_kernel /* kernel */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer(cl_kernel /* kernel */,
+ cl_uint /* arg_index */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo(cl_kernel /* kernel */,
+ cl_kernel_exec_info /* param_name */,
+ size_t /* param_value_size */,
+ const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel /* kernel */,
cl_kernel_info /* param_name */,
size_t /* param_value_size */,
@@ -1122,13 +1232,6 @@ clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueTask(cl_command_queue /* command_queue */,
- cl_kernel /* kernel */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue /* command_queue */,
void (CL_CALLBACK * /*user_func*/)(void *),
void * /* args */,
@@ -1141,17 +1244,67 @@ clEnqueueNativeKernel(cl_command_queue /* command_queue */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree(cl_command_queue /* command_queue */,
+ cl_uint /* num_svm_pointers */,
+ void *[] /* svm_pointers[] */,
+ void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+ cl_uint /* num_svm_pointers */,
+ void *[] /* svm_pointers[] */,
+ void * /* user_data */),
+ void * /* user_data */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy(cl_command_queue /* command_queue */,
+ cl_bool /* blocking_copy */,
+ void * /* dst_ptr */,
+ const void * /* src_ptr */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill(cl_command_queue /* command_queue */,
+ void * /* svm_ptr */,
+ const void * /* pattern */,
+ size_t /* pattern_size */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap(cl_command_queue /* command_queue */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* flags */,
+ void * /* svm_ptr */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap(cl_command_queue /* command_queue */,
+ void * /* svm_ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+
/* Extension function access
*
@@ -1205,7 +1358,29 @@ clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-
+
+/* Deprecated OpenCL 2.0 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context /* context */,
+ cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
+clCreateSampler(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
#ifdef __cplusplus
}
#endif
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
deleted file mode 100644
index 38fac19..0000000
--- a/include/CL/cl.hpp
+++ /dev/null
@@ -1,12452 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2008-2013 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- ******************************************************************************/
-
-/*! \file
- *
- * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
- * OpenCL 1.2 (rev 15)
- * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
- *
- * Additions and fixes from:
- * Brian Cole, March 3rd 2010 and April 2012
- * Matt Gruenke, April 2012.
- * Bruce Merry, February 2013.
- * Tom Deakin and Simon McIntosh-Smith, July 2013
- *
- * \version 1.2.6
- * \date August 2013
- *
- * Optional extension support
- *
- * cl
- * cl_ext_device_fission
- * #define USE_CL_DEVICE_FISSION
- */
-
-/*! \mainpage
- * \section intro Introduction
- * For many large applications C++ is the language of choice and so it seems
- * reasonable to define C++ bindings for OpenCL.
- *
- *
- * The interface is contained with a single C++ header file \em cl.hpp and all
- * definitions are contained within the namespace \em cl. There is no additional
- * requirement to include \em cl.h and to use either the C++ or original C
- * bindings it is enough to simply include \em cl.hpp.
- *
- * The bindings themselves are lightweight and correspond closely to the
- * underlying C API. Using the C++ bindings introduces no additional execution
- * overhead.
- *
- * For detail documentation on the bindings see:
- *
- * The OpenCL C++ Wrapper API 1.2 (revision 09)
- * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
- *
- * \section example Example
- *
- * The following example shows a general use case for the C++
- * bindings, including support for the optional exception feature and
- * also the supplied vector and string classes, see following sections for
- * decriptions of these features.
- *
- * \code
- * #define __CL_ENABLE_EXCEPTIONS
- *
- * #if defined(__APPLE__) || defined(__MACOSX)
- * #include <OpenCL/cl.hpp>
- * #else
- * #include <CL/cl.hpp>
- * #endif
- * #include <cstdio>
- * #include <cstdlib>
- * #include <iostream>
- *
- * const char * helloStr = "__kernel void "
- * "hello(void) "
- * "{ "
- * " "
- * "} ";
- *
- * int
- * main(void)
- * {
- * cl_int err = CL_SUCCESS;
- * try {
- *
- * std::vector<cl::Platform> platforms;
- * cl::Platform::get(&platforms);
- * if (platforms.size() == 0) {
- * std::cout << "Platform size 0\n";
- * return -1;
- * }
- *
- * cl_context_properties properties[] =
- * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
- * cl::Context context(CL_DEVICE_TYPE_CPU, properties);
- *
- * std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
- *
- * cl::Program::Sources source(1,
- * std::make_pair(helloStr,strlen(helloStr)));
- * cl::Program program_ = cl::Program(context, source);
- * program_.build(devices);
- *
- * cl::Kernel kernel(program_, "hello", &err);
- *
- * cl::Event event;
- * cl::CommandQueue queue(context, devices[0], 0, &err);
- * queue.enqueueNDRangeKernel(
- * kernel,
- * cl::NullRange,
- * cl::NDRange(4,4),
- * cl::NullRange,
- * NULL,
- * &event);
- *
- * event.wait();
- * }
- * catch (cl::Error err) {
- * std::cerr
- * << "ERROR: "
- * << err.what()
- * << "("
- * << err.err()
- * << ")"
- * << std::endl;
- * }
- *
- * return EXIT_SUCCESS;
- * }
- *
- * \endcode
- *
- */
-#ifndef CL_HPP_
-#define CL_HPP_
-
-#ifdef _WIN32
-
-#include <windows.h>
-#include <malloc.h>
-#include <iterator>
-#include <intrin.h>
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
-#include <exception>
-#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
-
-#pragma push_macro("max")
-#undef max
-#if defined(USE_DX_INTEROP)
-#include <CL/cl_d3d10.h>
-#include <CL/cl_dx9_media_sharing.h>
-#endif
-#endif // _WIN32
-
-//
-#if defined(USE_CL_DEVICE_FISSION)
-#include <CL/cl_ext.h>
-#endif
-
-#if defined(__APPLE__) || defined(__MACOSX)
-#include <OpenGL/OpenGL.h>
-#include <OpenCL/opencl.h>
-#include <libkern/OSAtomic.h>
-#else
-#include <GL/gl.h>
-#include <CL/opencl.h>
-#endif // !__APPLE__
-
-// To avoid accidentally taking ownership of core OpenCL types
-// such as cl_kernel constructors are made explicit
-// under OpenCL 1.2
-#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS explicit
-#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS
-#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-// Define deprecated prefixes and suffixes to ensure compilation
-// in case they are not pre-defined
-#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
-
-#if !defined(CL_CALLBACK)
-#define CL_CALLBACK
-#endif //CL_CALLBACK
-
-#include <utility>
-#include <limits>
-
-#if !defined(__NO_STD_VECTOR)
-#include <vector>
-#endif
-
-#if !defined(__NO_STD_STRING)
-#include <string>
-#endif
-
-#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
-#include <alloca.h>
-
-#include <emmintrin.h>
-#include <xmmintrin.h>
-#endif // linux
-
-#include <cstring>
-
-
-/*! \namespace cl
- *
- * \brief The OpenCL C++ bindings are defined within this namespace.
- *
- */
-namespace cl {
-
-class Memory;
-
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
-#define __INIT_CL_EXT_FCN_PTR(name) \
- if(!pfn_##name) { \
- pfn_##name = (PFN_##name) \
- clGetExtensionFunctionAddress(#name); \
- if(!pfn_##name) { \
- } \
- }
-#endif // #if defined(CL_VERSION_1_1)
-
-#if defined(CL_VERSION_1_2)
-#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
- if(!pfn_##name) { \
- pfn_##name = (PFN_##name) \
- clGetExtensionFunctionAddressForPlatform(platform, #name); \
- if(!pfn_##name) { \
- } \
- }
-#endif // #if defined(CL_VERSION_1_2)
-
-class Program;
-class Device;
-class Context;
-class CommandQueue;
-class Memory;
-class Buffer;
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
-/*! \brief Exception class
- *
- * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
- */
-class Error : public std::exception
-{
-private:
- cl_int err_;
- const char * errStr_;
-public:
- /*! \brief Create a new CL error exception for a given error code
- * and corresponding message.
- *
- * \param err error code value.
- *
- * \param errStr a descriptive string that must remain in scope until
- * handling of the exception has concluded. If set, it
- * will be returned by what().
- */
- Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
- {}
-
- ~Error() throw() {}
-
- /*! \brief Get error string associated with exception
- *
- * \return A memory pointer to the error message string.
- */
- virtual const char * what() const throw ()
- {
- if (errStr_ == NULL) {
- return "empty";
- }
- else {
- return errStr_;
- }
- }
-
- /*! \brief Get error code associated with exception
- *
- * \return The error code.
- */
- cl_int err(void) const { return err_; }
-};
-
-#define __ERR_STR(x) #x
-#else
-#define __ERR_STR(x) NULL
-#endif // __CL_ENABLE_EXCEPTIONS
-
-
-namespace detail
-{
-#if defined(__CL_ENABLE_EXCEPTIONS)
-static inline cl_int errHandler (
- cl_int err,
- const char * errStr = NULL)
-{
- if (err != CL_SUCCESS) {
- throw Error(err, errStr);
- }
- return err;
-}
-#else
-static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
-{
- (void) errStr; // suppress unused variable warning
- return err;
-}
-#endif // __CL_ENABLE_EXCEPTIONS
-}
-
-
-
-//! \cond DOXYGEN_DETAIL
-#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo)
-#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo)
-#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs)
-#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs)
-#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo)
-#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo)
-#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfilingInfo)
-#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo)
-#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo)
-#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo)
-#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo)
-#if defined(CL_VERSION_1_2)
-#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo)
-#endif // #if defined(CL_VERSION_1_2)
-#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo)
-#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo)
-#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo)
-#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo)
-
-#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext)
-#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType)
-#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats)
-
-#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer)
-#define __COPY_ERR __ERR_STR(cl::copy)
-#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer)
-#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
-#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLRenderbuffer)
-#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo)
-#if defined(CL_VERSION_1_2)
-#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage)
-#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture)
-#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions)
-#endif // #if defined(CL_VERSION_1_2)
-#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler)
-#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
-
-#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent)
-#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus)
-#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback)
-#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents)
-
-#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel)
-#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg)
-#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource)
-#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary)
-#if defined(CL_VERSION_1_2)
-#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels)
-#endif // #if defined(CL_VERSION_1_2)
-#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
-#if defined(CL_VERSION_1_2)
-#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram)
-
-#endif // #if defined(CL_VERSION_1_2)
-#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
-
-#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue)
-#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty)
-#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer)
-#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect)
-#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer)
-#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect)
-#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer)
-#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect)
-#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer)
-#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage)
-#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage)
-#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage)
-#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage)
-#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer)
-#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage)
-#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer)
-#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage)
-#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnmapMemObject)
-#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel)
-#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask)
-#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel)
-#if defined(CL_VERSION_1_2)
-#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects)
-#endif // #if defined(CL_VERSION_1_2)
-
-#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects)
-#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects)
-
-
-#define __RETAIN_ERR __ERR_STR(Retain Object)
-#define __RELEASE_ERR __ERR_STR(Release Object)
-#define __FLUSH_ERR __ERR_STR(clFlush)
-#define __FINISH_ERR __ERR_STR(clFinish)
-#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error)
-
-/**
- * Sub-device creation: CL 1.2 uses the core clCreateSubDevices;
- * earlier versions fall back to the cl_ext_device_fission extension.
- */
-#if defined(CL_VERSION_1_2)
-#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices)
-#else
-#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT)
-#endif // #if defined(CL_VERSION_1_2)
-
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
-#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker)
-#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents)
-#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier)
-#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler)
-#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D)
-#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D)
-#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D)
-#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D)
-#endif // #if defined(CL_VERSION_1_1)
-
-#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
-//! \endcond
-
-/**
- * CL 1.2 marker and barrier commands
- */
-#if defined(CL_VERSION_1_2)
-#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList)
-#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList)
-#endif // #if defined(CL_VERSION_1_2)
-
-#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
-typedef std::string STRING_CLASS;
-#elif !defined(__USE_DEV_STRING)
-
-/*! \class string
- * \brief Simple string class that provides a limited subset of std::string
- * functionality but avoids many of the issues that come with that class.
-
- * \note Deprecated. Please use std::string as default or
- * re-define the string class to match the std::string
- * interface by defining STRING_CLASS
- */
-class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-{
-private:
- ::size_t size_;
- char * str_;
-public:
- //! \brief Constructs an empty string, allocating no memory.
- string(void) : size_(0), str_(NULL)
- {
- }
-
- /*! \brief Constructs a string populated from an arbitrary value of
- * specified size.
- *
- * An extra '\0' is added, in case none was contained in str.
- *
- * \param str the initial value of the string instance. Note that '\0'
- * characters receive no special treatment. If NULL,
- * the string is left empty, with a size of 0.
- *
- * \param size the number of characters to copy from str.
- */
- string(const char * str, ::size_t size) :
- size_(size),
- str_(NULL)
- {
- if( size > 0 ) {
- str_ = new char[size_+1];
- if (str_ != NULL) {
- memcpy(str_, str, size_ * sizeof(char));
- str_[size_] = '\0';
- }
- else {
- size_ = 0;
- }
- }
- }
-
- /*! \brief Constructs a string populated from a null-terminated value.
- *
- * \param str the null-terminated initial value of the string instance.
- * If NULL, the string is left empty, with a size of 0.
- */
- string(const char * str) :
- size_(0),
- str_(NULL)
- {
- if( str ) {
- size_= ::strlen(str);
- }
- if( size_ > 0 ) {
- str_ = new char[size_ + 1];
- if (str_ != NULL) {
- memcpy(str_, str, (size_ + 1) * sizeof(char));
- }
- }
- }
-
- void resize( ::size_t n )
- {
- if( size_ == n ) {
- return;
- }
- if (n == 0) {
- if( str_ ) {
- delete [] str_;
- }
- str_ = NULL;
- size_ = 0;
- }
- else {
- char *newString = new char[n + 1];
-            ::size_t copySize = n;
- if( size_ < n ) {
- copySize = size_;
- }
- size_ = n;
-
- if(str_) {
- memcpy(newString, str_, (copySize + 1) * sizeof(char));
- }
- if( copySize < size_ ) {
- memset(newString + copySize, 0, size_ - copySize);
- }
- newString[size_] = '\0';
-
- delete [] str_;
- str_ = newString;
- }
- }
-
- const char& operator[] ( ::size_t pos ) const
- {
- return str_[pos];
- }
-
- char& operator[] ( ::size_t pos )
- {
- return str_[pos];
- }
-
- /*! \brief Copies the value of another string to this one.
- *
- * \param rhs the string to copy.
- *
- * \returns a reference to the modified instance.
- */
- string& operator=(const string& rhs)
- {
- if (this == &rhs) {
- return *this;
- }
-
- if( str_ != NULL ) {
- delete [] str_;
- str_ = NULL;
- size_ = 0;
- }
-
- if (rhs.size_ == 0 || rhs.str_ == NULL) {
- str_ = NULL;
- size_ = 0;
- }
- else {
- str_ = new char[rhs.size_ + 1];
- size_ = rhs.size_;
-
- if (str_ != NULL) {
- memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
- }
- else {
- size_ = 0;
- }
- }
-
- return *this;
- }
-
- /*! \brief Constructs a string by copying the value of another instance.
- *
- * \param rhs the string to copy.
- */
- string(const string& rhs) :
- size_(0),
- str_(NULL)
- {
- *this = rhs;
- }
-
- //! \brief Destructor - frees memory used to hold the current value.
- ~string()
- {
- delete[] str_;
- str_ = NULL;
- }
-
- //! \brief Queries the length of the string, excluding any added '\0's.
- ::size_t size(void) const { return size_; }
-
- //! \brief Queries the length of the string, excluding any added '\0's.
- ::size_t length(void) const { return size(); }
-
- /*! \brief Returns a pointer to the private copy held by this instance,
- * or "" if empty/unset.
- */
- const char * c_str(void) const { return (str_) ? str_ : "";}
-};
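-/*! A minimal usage sketch of the string class above (illustrative
- * values only, not part of the original header):
- * \code
- * cl::string s("kernel.cl");   // copies the null-terminated input
- * ::size_t n = s.length();     // == 9
- * const char *p = s.c_str();   // "" when the string is empty/unset
- * \endcode
- */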
-typedef cl::string STRING_CLASS;
-#endif // #elif !defined(__USE_DEV_STRING)
-
-#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-#define VECTOR_CLASS std::vector
-#elif !defined(__USE_DEV_VECTOR)
-#define VECTOR_CLASS cl::vector
-
-#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
-#define __MAX_DEFAULT_VECTOR_SIZE 10
-#endif
-
-/*! \class vector
- * \brief Fixed sized vector implementation that mirrors a subset of
- * std::vector functionality, compatible with std::vector.
- *
- * \note Deprecated. Please use std::vector as default or
- * re-define the vector class to match the std::vector
- * interface by defining VECTOR_CLASS
- *
- * \note Not recommended for use with custom objects as
- * the current implementation will default-construct N elements
- *
- * \note
- * This differs from std::vector<> not just in memory allocation,
- * but also in terms of when members are constructed, destroyed,
- * and assigned instead of being copy constructed.
- *
- * \param T type of element contained in the vector.
- *
- * \param N maximum size of the vector.
- */
-template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
-class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-{
-private:
- T data_[N];
- unsigned int size_;
-
-public:
-    //! \brief Constructs an empty vector (no dynamic memory is allocated).
- vector() :
- size_(static_cast<unsigned int>(0))
- {}
-
-    //! \brief Destroys all contained elements (the storage itself is a fixed-size member array, so nothing is deallocated).
- ~vector()
- {
- clear();
- }
-
- //! \brief Returns the number of elements currently contained.
- unsigned int size(void) const
- {
- return size_;
- }
-
- /*! \brief Empties the vector of all elements.
- * \note
- * This does not deallocate memory but will invoke destructors
- * on contained elements.
- */
- void clear()
- {
- while(!empty()) {
- pop_back();
- }
- }
-
- /*! \brief Appends an element after the last valid element.
- * Calling this on a vector that has reached capacity will throw an
- * exception if exceptions are enabled.
- */
- void push_back (const T& x)
- {
- if (size() < N) {
- new (&data_[size_]) T(x);
- size_++;
- } else {
- detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
- }
- }
-
- /*! \brief Removes the last valid element from the vector.
- * Calling this on an empty vector will throw an exception
- * if exceptions are enabled.
- */
- void pop_back(void)
- {
- if (size_ != 0) {
- --size_;
- data_[size_].~T();
- } else {
- detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
- }
- }
-
- /*! \brief Constructs with a value copied from another.
- *
- * \param vec the vector to copy.
- */
-    vector(const vector<T, N>& vec) :
-        size_(0)
-    {
-        if (vec.size_ != 0) {
-            // start empty: assign() then copy-constructs each element
-            // in place, avoiding destructor calls on unconstructed slots
-            assign(vec.begin(), vec.end());
-        }
-    }
-
- /*! \brief Constructs with a specified number of initial elements.
- *
- * \param size number of initial elements.
- *
- * \param val value of initial elements.
- */
- vector(unsigned int size, const T& val = T()) :
- size_(0)
- {
- for (unsigned int i = 0; i < size; i++) {
- push_back(val);
- }
- }
-
- /*! \brief Overwrites the current content with that copied from another
- * instance.
- *
- * \param rhs vector to copy.
- *
- * \returns a reference to this.
- */
- vector<T, N>& operator=(const vector<T, N>& rhs)
- {
- if (this == &rhs) {
- return *this;
- }
-
- if (rhs.size_ != 0) {
- assign(rhs.begin(), rhs.end());
- } else {
- clear();
- }
-
- return *this;
- }
-
- /*! \brief Tests equality against another instance.
- *
- * \param vec the vector against which to compare.
- */
- bool operator==(vector<T,N> &vec)
- {
- if (size() != vec.size()) {
- return false;
- }
-
- for( unsigned int i = 0; i < size(); ++i ) {
- if( operator[](i) != vec[i] ) {
- return false;
- }
- }
- return true;
- }
-
- //! \brief Conversion operator to T*.
- operator T* () { return data_; }
-
- //! \brief Conversion operator to const T*.
- operator const T* () const { return data_; }
-
- //! \brief Tests whether this instance has any elements.
- bool empty (void) const
- {
- return size_==0;
- }
-
- //! \brief Returns the maximum number of elements this instance can hold.
- unsigned int max_size (void) const
- {
- return N;
- }
-
- //! \brief Returns the maximum number of elements this instance can hold.
- unsigned int capacity () const
- {
- return N;
- }
-
- /*! \brief Returns a reference to a given element.
- *
- * \param index which element to access.
- * \note
- * The caller is responsible for ensuring index is >= 0 and < size().
- */
- T& operator[](int index)
- {
- return data_[index];
- }
-
- /*! \brief Returns a const reference to a given element.
- *
- * \param index which element to access.
- *
- * \note
- * The caller is responsible for ensuring index is >= 0 and < size().
- */
- const T& operator[](int index) const
- {
- return data_[index];
- }
-
- /*! \brief Assigns elements of the vector based on a source iterator range.
- *
- * \param start Beginning iterator of source range
- * \param end End iterator of source range
- *
- * \note
- * Will throw an exception if exceptions are enabled and size exceeded.
- */
- template<class I>
- void assign(I start, I end)
- {
- clear();
- while(start != end) {
- push_back(*start);
- start++;
- }
- }
-
- /*! \class iterator
- * \brief Const iterator class for vectors
- */
- class iterator
- {
- private:
- const vector<T,N> *vec_;
- int index_;
-
- /**
- * Internal iterator constructor to capture reference
- * to the vector it iterates over rather than taking
- * the vector by copy.
- */
- iterator (const vector<T,N> &vec, int index) :
- vec_(&vec)
- {
- if( !vec.empty() ) {
- index_ = index;
- } else {
- index_ = -1;
- }
- }
-
- public:
-        iterator(void) :
-            vec_(NULL),
-            index_(-1)
- {
- }
-
- iterator(const iterator& rhs) :
- vec_(rhs.vec_),
- index_(rhs.index_)
- {
- }
-
- ~iterator(void) {}
-
- static iterator begin(const cl::vector<T,N> &vec)
- {
- iterator i(vec, 0);
-
- return i;
- }
-
- static iterator end(const cl::vector<T,N> &vec)
- {
- iterator i(vec, vec.size());
-
- return i;
- }
-
- bool operator==(iterator i)
- {
- return ((vec_ == i.vec_) &&
- (index_ == i.index_));
- }
-
- bool operator!=(iterator i)
- {
- return (!(*this==i));
- }
-
- iterator& operator++()
- {
- ++index_;
- return *this;
- }
-
- iterator operator++(int)
- {
- iterator retVal(*this);
- ++index_;
- return retVal;
- }
-
- iterator& operator--()
- {
- --index_;
- return *this;
- }
-
- iterator operator--(int)
- {
- iterator retVal(*this);
- --index_;
- return retVal;
- }
-
- const T& operator *() const
- {
- return (*vec_)[index_];
- }
- };
-
- iterator begin(void)
- {
- return iterator::begin(*this);
- }
-
- iterator begin(void) const
- {
- return iterator::begin(*this);
- }
-
- iterator end(void)
- {
- return iterator::end(*this);
- }
-
- iterator end(void) const
- {
- return iterator::end(*this);
- }
-
- T& front(void)
- {
- return data_[0];
- }
-
- T& back(void)
- {
-        return data_[size_-1];
- }
-
- const T& front(void) const
- {
- return data_[0];
- }
-
- const T& back(void) const
- {
- return data_[size_-1];
- }
-};
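-/*! An illustrative sketch of basic use of the vector class above
- * (hypothetical values, not part of the original header):
- * \code
- * cl::vector<int, 4> v;
- * v.push_back(1);
- * v.push_back(2);               // size() == 2, capacity() == 4
- * for (cl::vector<int, 4>::iterator i = v.begin(); i != v.end(); ++i) {
- *     int value = *i;           // iterate over the contained elements
- * }
- * \endcode
- */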
-#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-
-
-
-
-
-namespace detail {
-#define __DEFAULT_NOT_INITIALIZED 1
-#define __DEFAULT_BEING_INITIALIZED 2
-#define __DEFAULT_INITIALIZED 4
-
- /*
- * Compare and exchange primitives are needed for handling of defaults
- */
- inline int compare_exchange(volatile int * dest, int exchange, int comparand)
- {
-#ifdef _WIN32
- return (int)(InterlockedCompareExchange(
- (volatile long*)dest,
- (long)exchange,
- (long)comparand));
-#elif defined(__APPLE__) || defined(__MACOSX)
-    // OSAtomicOr32Orig is a fetch-or, not a compare-and-swap; loop on
-    // OSAtomicCompareAndSwap32 so this branch matches the semantics of
-    // the others (return the value previously held by *dest).
-    int oldValue;
-    do {
-        oldValue = *dest;
-        if (oldValue != comparand) {
-            return oldValue;
-        }
-    } while (!OSAtomicCompareAndSwap32(comparand, exchange, (volatile int32_t*)dest));
-    return comparand;
-#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
- return (__sync_val_compare_and_swap(
- dest,
- comparand,
- exchange));
-#endif // !_WIN32
- }
-
- inline void fence() { _mm_mfence(); }
-} // namespace detail
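-
-/*! A sketch of how the primitives above gate one-time initialization of
- * defaults; `state` and `init()` are placeholder names, not part of the
- * original header:
- * \code
- * static volatile int state = __DEFAULT_NOT_INITIALIZED;
- * if (detail::compare_exchange(&state, __DEFAULT_BEING_INITIALIZED,
- *         __DEFAULT_NOT_INITIALIZED) == __DEFAULT_NOT_INITIALIZED) {
- *     init();            // only the first caller initializes
- *     detail::fence();   // publish the writes made by init()
- *     state = __DEFAULT_INITIALIZED;
- * } else {
- *     while (state != __DEFAULT_INITIALIZED) {
- *         detail::fence();  // spin until the initializer finishes
- *     }
- * }
- * \endcode
- */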
-
-
-/*! \brief class used to interface between C++ and
- * OpenCL C calls that require arrays of size_t values, whose
- * size is known statically.
- */
-template <int N>
-class size_t
-{
-private:
- ::size_t data_[N];
-
-public:
- //! \brief Initialize size_t to all 0s
- size_t()
- {
- for( int i = 0; i < N; ++i ) {
- data_[i] = 0;
- }
- }
-
- ::size_t& operator[](int index)
- {
- return data_[index];
- }
-
- const ::size_t& operator[](int index) const
- {
- return data_[index];
- }
-
- //! \brief Conversion operator to T*.
- operator ::size_t* () { return data_; }
-
- //! \brief Conversion operator to const T*.
- operator const ::size_t* () const { return data_; }
-};
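-/*! Illustrative use of size_t<N> (a sketch with hypothetical values):
- * it decays to ::size_t* wherever the C API expects an array:
- * \code
- * cl::size_t<3> origin;            // zero-initialized by the constructor
- * origin[0] = 16; origin[1] = 8;   // element access via operator[]
- * ::size_t* raw = origin;          // implicit conversion for C calls
- * \endcode
- */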
-
-namespace detail {
-
-// Generic getInfoHelper. The final parameter is used to guide overload
-// resolution: the actual parameter passed is an int, which makes this
-// a worse conversion sequence than a specialization that declares the
-// parameter as an int.
-template<typename Functor, typename T>
-inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
-{
- return f(name, sizeof(T), param, NULL);
-}
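-
-// A minimal sketch of the int/long dispatch used throughout this file
-// (the names below are hypothetical, not part of the original header).
-// Call sites pass a literal 0, which is an int, so an overload whose
-// guide parameter is int beats this generic one, which needs int->long:
-//
-//   template <typename T> int pick(T*, long) { return 0; }  // generic
-//   inline int pick(STRING_CLASS*, int)      { return 1; }  // specialized
-//
-//   STRING_CLASS s;
-//   pick(&s, 0);  // == 1: the exact int match is preferred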
-
-// Specialized getInfoHelper for VECTOR_CLASS params
-template <typename Func, typename T>
-inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
-{
- ::size_t required;
- cl_int err = f(name, 0, NULL, &required);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- T* value = (T*) alloca(required);
- err = f(name, required, value, NULL);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- param->assign(&value[0], &value[required/sizeof(T)]);
- return CL_SUCCESS;
-}
-
-/* Specialization for reference-counted types. This depends on the
- * existence of Wrapper<T>::cl_type, and none of the other types having the
- * cl_type member. Note that simply specifying the parameter as Wrapper<T>
- * does not work, because when using a derived type (e.g. Context) the generic
- * template will provide a better match.
- */
-template <typename Func, typename T>
-inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
-{
- ::size_t required;
- cl_int err = f(name, 0, NULL, &required);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- typename T::cl_type * value = (typename T::cl_type *) alloca(required);
- err = f(name, required, value, NULL);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- ::size_t elements = required / sizeof(typename T::cl_type);
- param->assign(&value[0], &value[elements]);
- for (::size_t i = 0; i < elements; i++)
- {
- if (value[i] != NULL)
- {
- err = (*param)[i].retain();
- if (err != CL_SUCCESS) {
- return err;
- }
- }
- }
- return CL_SUCCESS;
-}
-
-// Specialized for getInfo<CL_PROGRAM_BINARIES>
-template <typename Func>
-inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
-{
- cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
-
- if (err != CL_SUCCESS) {
- return err;
- }
-
- return CL_SUCCESS;
-}
-
-// Specialized getInfoHelper for STRING_CLASS params
-template <typename Func>
-inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
-{
- ::size_t required;
- cl_int err = f(name, 0, NULL, &required);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- char* value = (char*) alloca(required);
- err = f(name, required, value, NULL);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- *param = value;
- return CL_SUCCESS;
-}
-
-// Specialized getInfoHelper for cl::size_t params
-template <typename Func, ::size_t N>
-inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
-{
- ::size_t required;
- cl_int err = f(name, 0, NULL, &required);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- ::size_t* value = (::size_t*) alloca(required);
- err = f(name, required, value, NULL);
- if (err != CL_SUCCESS) {
- return err;
- }
-
-    for( ::size_t i = 0; i < N; ++i) {
- (*param)[i] = value[i];
- }
-
- return CL_SUCCESS;
-}
-
-template<typename T> struct ReferenceHandler;
-
-/* Specialization for reference-counted types. This depends on the
- * existence of Wrapper<T>::cl_type, and none of the other types having the
- * cl_type member. Note that simply specifying the parameter as Wrapper<T>
- * does not work, because when using a derived type (e.g. Context) the generic
- * template will provide a better match.
- */
-template<typename Func, typename T>
-inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
-{
- typename T::cl_type value;
- cl_int err = f(name, sizeof(value), &value, NULL);
- if (err != CL_SUCCESS) {
- return err;
- }
- *param = value;
- if (value != NULL)
- {
- err = param->retain();
- if (err != CL_SUCCESS) {
- return err;
- }
- }
- return CL_SUCCESS;
-}
-
-#define __PARAM_NAME_INFO_1_0(F) \
- F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
- F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
- F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
- F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
- F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
- \
- F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
- F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
- F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
- F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
- F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
- F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
- F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
- F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
- F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
- F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
- F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
- F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
- F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
- F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
- F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
- F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
- F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
- F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
- F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
- F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
- F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
- F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
- F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
- F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
- F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
- F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
- F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
- F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
- F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
- F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
- F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
- F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
- F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
- F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
- F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
- F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
- F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
- F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
- \
- F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
- F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
- F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
- \
- F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
- F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
- F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
-    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \
- \
- F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
- F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
- F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
- F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
- \
- F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
- F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
- F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
- F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
- F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
- F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
- F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
- \
- F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
- F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
- F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
- F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
- F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
- F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
- F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
- \
- F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
- F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
-    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
-    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
-    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
- \
- F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
- F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
- F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
- F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
- F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
- F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
- F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
- \
- F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
- F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
- F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
- \
- F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
- F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
- F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
- F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
- F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
- \
- F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
- F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
- F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
- \
- F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
- F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
- F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
- F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
-
-#if defined(CL_VERSION_1_1)
-#define __PARAM_NAME_INFO_1_1(F) \
- F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
- F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
- F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
- F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
- F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
- F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
- F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
- \
- F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
- F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
- \
- F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
- F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
- \
- F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
-#endif // CL_VERSION_1_1
-
-
-#if defined(CL_VERSION_1_2)
-#define __PARAM_NAME_INFO_1_2(F) \
- F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
- \
- F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
- F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
- \
- F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
- \
- F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
- \
- F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
- F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
- F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
- F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
- \
- F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
- F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
- F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>) \
- F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \
- F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
- F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
-#endif // #if defined(CL_VERSION_1_2)
-
-#if defined(USE_CL_DEVICE_FISSION)
-#define __PARAM_NAME_DEVICE_FISSION(F) \
- F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
- F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
- F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
- F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
- F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
-#endif // USE_CL_DEVICE_FISSION
-
-template <typename enum_type, cl_int Name>
-struct param_traits {};
-
-#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
-struct token; \
-template<> \
-struct param_traits<detail:: token,param_name> \
-{ \
- enum { value = param_name }; \
- typedef T param_type; \
-};
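-
-/* For reference, one expansion of the macro above, taking the
- * CL_PLATFORM_NAME row of __PARAM_NAME_INFO_1_0 as input:
- *
- *   struct cl_platform_info;
- *   template<>
- *   struct param_traits<detail:: cl_platform_info, CL_PLATFORM_NAME>
- *   {
- *       enum { value = CL_PLATFORM_NAME };
- *       typedef STRING_CLASS param_type;
- *   };
- */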
-
-__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
-#if defined(CL_VERSION_1_1)
-__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
-#endif // CL_VERSION_1_1
-#if defined(CL_VERSION_1_2)
-__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
-#endif // CL_VERSION_1_2
-
-#if defined(USE_CL_DEVICE_FISSION)
-__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS)
-#endif // USE_CL_DEVICE_FISSION
-
-#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
-__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
-#endif
-
-#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
-#endif
-
-#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
-#endif
-#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_SIMD_WIDTH_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
-#endif
-
-#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_WARP_SIZE_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_GPU_OVERLAP_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
-#endif
-#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
-#endif
-#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
-__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
-#endif
-
-// Convenience functions
-
-template <typename Func, typename T>
-inline cl_int
-getInfo(Func f, cl_uint name, T* param)
-{
- return getInfoHelper(f, name, param, 0);
-}
-
-template <typename Func, typename Arg0>
-struct GetInfoFunctor0
-{
- Func f_; const Arg0& arg0_;
- cl_int operator ()(
- cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
- { return f_(arg0_, param, size, value, size_ret); }
-};
-
-template <typename Func, typename Arg0, typename Arg1>
-struct GetInfoFunctor1
-{
- Func f_; const Arg0& arg0_; const Arg1& arg1_;
- cl_int operator ()(
- cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
- { return f_(arg0_, arg1_, param, size, value, size_ret); }
-};
-
-template <typename Func, typename Arg0, typename T>
-inline cl_int
-getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
-{
- GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
- return getInfoHelper(f0, name, param, 0);
-}
-
-template <typename Func, typename Arg0, typename Arg1, typename T>
-inline cl_int
-getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
-{
- GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
- return getInfoHelper(f0, name, param, 0);
-}
-
-template<typename T>
-struct ReferenceHandler
-{ };
-
-#if defined(CL_VERSION_1_2)
-/**
- * OpenCL 1.2 devices do have retain/release.
- */
-template <>
-struct ReferenceHandler<cl_device_id>
-{
- /**
- * Retain the device.
- * \param device A valid device created using createSubDevices
- * \return
- * CL_SUCCESS if the function executed successfully.
- * CL_INVALID_DEVICE if device was not a valid subdevice
- * CL_OUT_OF_RESOURCES
- * CL_OUT_OF_HOST_MEMORY
- */
- static cl_int retain(cl_device_id device)
- { return ::clRetainDevice(device); }
- /**
-     * Release the device.
- * \param device A valid device created using createSubDevices
- * \return
- * CL_SUCCESS if the function executed successfully.
- * CL_INVALID_DEVICE if device was not a valid subdevice
- * CL_OUT_OF_RESOURCES
- * CL_OUT_OF_HOST_MEMORY
- */
- static cl_int release(cl_device_id device)
- { return ::clReleaseDevice(device); }
-};
-#else // #if defined(CL_VERSION_1_2)
-/**
- * OpenCL 1.1 devices do not have retain/release.
- */
-template <>
-struct ReferenceHandler<cl_device_id>
-{
- // cl_device_id does not have retain().
- static cl_int retain(cl_device_id)
- { return CL_SUCCESS; }
- // cl_device_id does not have release().
- static cl_int release(cl_device_id)
- { return CL_SUCCESS; }
-};
-#endif // #if defined(CL_VERSION_1_2)
-
-template <>
-struct ReferenceHandler<cl_platform_id>
-{
- // cl_platform_id does not have retain().
- static cl_int retain(cl_platform_id)
- { return CL_SUCCESS; }
- // cl_platform_id does not have release().
- static cl_int release(cl_platform_id)
- { return CL_SUCCESS; }
-};
-
-template <>
-struct ReferenceHandler<cl_context>
-{
- static cl_int retain(cl_context context)
- { return ::clRetainContext(context); }
- static cl_int release(cl_context context)
- { return ::clReleaseContext(context); }
-};
-
-template <>
-struct ReferenceHandler<cl_command_queue>
-{
- static cl_int retain(cl_command_queue queue)
- { return ::clRetainCommandQueue(queue); }
- static cl_int release(cl_command_queue queue)
- { return ::clReleaseCommandQueue(queue); }
-};
-
-template <>
-struct ReferenceHandler<cl_mem>
-{
- static cl_int retain(cl_mem memory)
- { return ::clRetainMemObject(memory); }
- static cl_int release(cl_mem memory)
- { return ::clReleaseMemObject(memory); }
-};
-
-template <>
-struct ReferenceHandler<cl_sampler>
-{
- static cl_int retain(cl_sampler sampler)
- { return ::clRetainSampler(sampler); }
- static cl_int release(cl_sampler sampler)
- { return ::clReleaseSampler(sampler); }
-};
-
-template <>
-struct ReferenceHandler<cl_program>
-{
- static cl_int retain(cl_program program)
- { return ::clRetainProgram(program); }
- static cl_int release(cl_program program)
- { return ::clReleaseProgram(program); }
-};
-
-template <>
-struct ReferenceHandler<cl_kernel>
-{
- static cl_int retain(cl_kernel kernel)
- { return ::clRetainKernel(kernel); }
- static cl_int release(cl_kernel kernel)
- { return ::clReleaseKernel(kernel); }
-};
-
-template <>
-struct ReferenceHandler<cl_event>
-{
- static cl_int retain(cl_event event)
- { return ::clRetainEvent(event); }
- static cl_int release(cl_event event)
- { return ::clReleaseEvent(event); }
-};
-
-
-// Extracts version number with major in the upper 16 bits, minor in the lower 16
-static cl_uint getVersion(const char *versionInfo)
-{
- int highVersion = 0;
- int lowVersion = 0;
-    int index = 7; // skip the "OpenCL " prefix of the version string
- while(versionInfo[index] != '.' ) {
- highVersion *= 10;
- highVersion += versionInfo[index]-'0';
- ++index;
- }
- ++index;
- while(versionInfo[index] != ' ' ) {
- lowVersion *= 10;
- lowVersion += versionInfo[index]-'0';
- ++index;
- }
- return (highVersion << 16) | lowVersion;
-}
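-
-// Worked example: for the string "OpenCL 1.2 beignet", index 7 lands just
-// past the "OpenCL " prefix, so highVersion parses to 1 and lowVersion to
-// 2, giving (1 << 16) | 2 == 0x00010002.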
-
-static cl_uint getPlatformVersion(cl_platform_id platform)
-{
- ::size_t size = 0;
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
- char *versionInfo = (char *) alloca(size);
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
- return getVersion(versionInfo);
-}
-
-static cl_uint getDevicePlatformVersion(cl_device_id device)
-{
- cl_platform_id platform;
- clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
- return getPlatformVersion(platform);
-}
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-static cl_uint getContextPlatformVersion(cl_context context)
-{
-    // The platform cannot be queried directly, so we first have to grab a
-    // device and obtain its platform
- ::size_t size = 0;
- clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
- if (size == 0)
- return 0;
- cl_device_id *devices = (cl_device_id *) alloca(size);
- clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
- return getDevicePlatformVersion(devices[0]);
-}
-#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-template <typename T>
-class Wrapper
-{
-public:
- typedef T cl_type;
-
-protected:
- cl_type object_;
-
-public:
- Wrapper() : object_(NULL) { }
-
- Wrapper(const cl_type &obj) : object_(obj) { }
-
- ~Wrapper()
- {
- if (object_ != NULL) { release(); }
- }
-
- Wrapper(const Wrapper<cl_type>& rhs)
- {
- object_ = rhs.object_;
- if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
- }
-
- Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
- {
- if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
- object_ = rhs.object_;
- if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
- return *this;
- }
-
- Wrapper<cl_type>& operator = (const cl_type &rhs)
- {
- if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
- object_ = rhs;
- return *this;
- }
-
- cl_type operator ()() const { return object_; }
-
- cl_type& operator ()() { return object_; }
-
-protected:
- template<typename Func, typename U>
- friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
-
- cl_int retain() const
- {
- return ReferenceHandler<cl_type>::retain(object_);
- }
-
- cl_int release() const
- {
- return ReferenceHandler<cl_type>::release(object_);
- }
-};
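-
-/*! A sketch of the reference counting implemented above (hypothetical
- * usage, not part of the original header). Wrapping a raw handle adopts
- * it without a retain, so the wrapper consumes one existing reference;
- * copies retain and release symmetrically:
- * \code
- * {
- *     cl::Context ctx(rawContext); // adopts rawContext's reference
- *     cl::Context copy(ctx);       // copy ctor -> clRetainContext()
- * }   // both destructors run -> clReleaseContext() twice, net balanced
- * \endcode
- */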
-
-template <>
-class Wrapper<cl_device_id>
-{
-public:
- typedef cl_device_id cl_type;
-
-protected:
- cl_type object_;
- bool referenceCountable_;
-
- static bool isReferenceCountable(cl_device_id device)
- {
- bool retVal = false;
- if (device != NULL) {
- int version = getDevicePlatformVersion(device);
-            if(version > ((1 << 16) + 1)) { // platform newer than 1.1, i.e. OpenCL 1.2+
- retVal = true;
- }
- }
- return retVal;
- }
-
-public:
- Wrapper() : object_(NULL), referenceCountable_(false)
- {
- }
-
- Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false)
- {
- referenceCountable_ = isReferenceCountable(obj);
- }
-
- ~Wrapper()
- {
- if (object_ != NULL) { release(); }
- }
-
- Wrapper(const Wrapper<cl_type>& rhs)
- {
- object_ = rhs.object_;
- referenceCountable_ = isReferenceCountable(object_);
- if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
- }
-
- Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
- {
- if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
- object_ = rhs.object_;
- referenceCountable_ = rhs.referenceCountable_;
- if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
- return *this;
- }
-
- Wrapper<cl_type>& operator = (const cl_type &rhs)
- {
- if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
- object_ = rhs;
- referenceCountable_ = isReferenceCountable(object_);
- return *this;
- }
-
- cl_type operator ()() const { return object_; }
-
- cl_type& operator ()() { return object_; }
-
-protected:
- template<typename Func, typename U>
- friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
-
- template<typename Func, typename U>
- friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
-
- cl_int retain() const
- {
- if( referenceCountable_ ) {
- return ReferenceHandler<cl_type>::retain(object_);
- }
- else {
- return CL_SUCCESS;
- }
- }
-
- cl_int release() const
- {
- if( referenceCountable_ ) {
- return ReferenceHandler<cl_type>::release(object_);
- }
- else {
- return CL_SUCCESS;
- }
- }
-};
-
-} // namespace detail
-//! \endcond
-
-/*! \struct ImageFormat
- * \brief Adds constructors and member functions for cl_image_format.
- *
- * \see cl_image_format
- */
-struct ImageFormat : public cl_image_format
-{
- //! \brief Default constructor - performs no initialization.
- ImageFormat(){}
-
- //! \brief Initializing constructor.
- ImageFormat(cl_channel_order order, cl_channel_type type)
- {
- image_channel_order = order;
- image_channel_data_type = type;
- }
-
- //! \brief Assignment operator.
- ImageFormat& operator = (const ImageFormat& rhs)
- {
- if (this != &rhs) {
- this->image_channel_data_type = rhs.image_channel_data_type;
- this->image_channel_order = rhs.image_channel_order;
- }
- return *this;
- }
-};
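-
-/*! An illustrative use (sketch, not part of the original header): an
- * 8-bit-per-channel RGBA format for image creation:
- * \code
- * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
- * \endcode
- */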
-
-/*! \brief Class interface for cl_device_id.
- *
- * \note Copies of these objects are inexpensive, since they don't 'own'
- * any underlying resources or data structures.
- *
- * \see cl_device_id
- */
-class Device : public detail::Wrapper<cl_device_id>
-{
-public:
- //! \brief Default constructor - initializes to NULL.
- Device() : detail::Wrapper<cl_type>() { }
-
- /*! \brief Copy constructor.
- *
- * This simply copies the device ID value, which is an inexpensive operation.
- */
- Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
-
- /*! \brief Constructor from cl_device_id.
- *
- * This simply copies the device ID value, which is an inexpensive operation.
- */
- Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
-
- /*! \brief Returns the first device on the default context.
- *
- * \see Context::getDefault()
- */
- static Device getDefault(cl_int * err = NULL);
-
- /*! \brief Assignment operator from Device.
- *
- * This simply copies the device ID value, which is an inexpensive operation.
- */
- Device& operator = (const Device& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_device_id.
- *
- * This simply copies the device ID value, which is an inexpensive operation.
- */
- Device& operator = (const cl_device_id& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetDeviceInfo().
- template <typename T>
- cl_int getInfo(cl_device_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetDeviceInfo, object_, name, param),
- __GET_DEVICE_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetDeviceInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_device_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_device_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
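-
-    /*! Illustrative calls (a sketch; `dev` stands for any valid device,
-     * not a name from the original header):
-     * \code
-     * STRING_CLASS name = dev.getInfo<CL_DEVICE_NAME>();    // by value
-     * cl_ulong gmem = 0;
-     * dev.getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &gmem);        // by pointer
-     * \endcode
-     */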
-
- /**
- * CL 1.2 version
- */
-#if defined(CL_VERSION_1_2)
-    //! \brief Wrapper for clCreateSubDevices().
- cl_int createSubDevices(
- const cl_device_partition_property * properties,
- VECTOR_CLASS<Device>* devices)
- {
- cl_uint n = 0;
- cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __CREATE_SUB_DEVICES);
- }
-
- cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
- err = clCreateSubDevices(object_, properties, n, ids, NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __CREATE_SUB_DEVICES);
- }
-
- devices->assign(&ids[0], &ids[n]);
- return CL_SUCCESS;
- }
-#endif // #if defined(CL_VERSION_1_2)
-
-/**
- * CL 1.1 version that uses device fission.
- */
-#if defined(CL_VERSION_1_1)
-#if defined(USE_CL_DEVICE_FISSION)
- cl_int createSubDevices(
- const cl_device_partition_property_ext * properties,
- VECTOR_CLASS<Device>* devices)
- {
- typedef CL_API_ENTRY cl_int
- ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
- cl_device_id /*in_device*/,
- const cl_device_partition_property_ext * /* properties */,
- cl_uint /*num_entries*/,
- cl_device_id * /*out_devices*/,
- cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
-
- static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
- __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
-
- cl_uint n = 0;
- cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __CREATE_SUB_DEVICES);
- }
-
- cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
- err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __CREATE_SUB_DEVICES);
- }
-
- devices->assign(&ids[0], &ids[n]);
- return CL_SUCCESS;
- }
-#endif // #if defined(USE_CL_DEVICE_FISSION)
-#endif // #if defined(CL_VERSION_1_1)
-};
-
-/*! \brief Class interface for cl_platform_id.
- *
- * \note Copies of these objects are inexpensive, since they don't 'own'
- * any underlying resources or data structures.
- *
- * \see cl_platform_id
- */
-class Platform : public detail::Wrapper<cl_platform_id>
-{
-public:
- //! \brief Default constructor - initializes to NULL.
- Platform() : detail::Wrapper<cl_type>() { }
-
- /*! \brief Copy constructor.
- *
- * This simply copies the platform ID value, which is an inexpensive operation.
- */
- Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
-
- /*! \brief Constructor from cl_platform_id.
- *
- * This simply copies the platform ID value, which is an inexpensive operation.
- */
- Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
-
- /*! \brief Assignment operator from Platform.
- *
- * This simply copies the platform ID value, which is an inexpensive operation.
- */
- Platform& operator = (const Platform& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_platform_id.
- *
- * This simply copies the platform ID value, which is an inexpensive operation.
- */
- Platform& operator = (const cl_platform_id& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetPlatformInfo().
- cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetPlatformInfo, object_, name, param),
- __GET_PLATFORM_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetPlatformInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_platform_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_platform_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- /*! \brief Gets a list of devices for this platform.
- *
- * Wraps clGetDeviceIDs().
- */
- cl_int getDevices(
- cl_device_type type,
- VECTOR_CLASS<Device>* devices) const
- {
- cl_uint n = 0;
- if( devices == NULL ) {
- return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
- }
- cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
- }
-
- cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
- err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
- }
-
- devices->assign(&ids[0], &ids[n]);
- return CL_SUCCESS;
- }
-
-#if defined(USE_DX_INTEROP)
- /*! \brief Gets the list of available D3D10 devices.
- *
- * \param d3d_device_source specifies how \a d3d_object is to be
- * interpreted (for example, as a D3D10 device or a DXGI adapter).
- *
- * \param d3d_object the Direct3D object whose associated OpenCL
- * devices are being queried.
- *
- * \param d3d_device_set specifies whether the preferred devices or
- * all matching devices for the D3D10 object are returned.
- *
- * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
- * values returned in devices can be used to identify a specific OpenCL
- * device. If the \a devices argument is NULL, this argument is ignored.
- *
- * \return CL_SUCCESS if the function is executed successfully, or an
- * OpenCL error code otherwise.
- *
- * The application can query specific capabilities of the OpenCL device(s)
- * returned by getDevices to determine which device(s) to use.
- *
- * \note If exceptions are enabled and a return value other than
- * CL_SUCCESS is generated, a cl::Error exception is thrown.
- */
- cl_int getDevices(
- cl_d3d10_device_source_khr d3d_device_source,
- void * d3d_object,
- cl_d3d10_device_set_khr d3d_device_set,
- VECTOR_CLASS<Device>* devices) const
- {
- typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
- cl_platform_id platform,
- cl_d3d10_device_source_khr d3d_device_source,
- void * d3d_object,
- cl_d3d10_device_set_khr d3d_device_set,
- cl_uint num_entries,
- cl_device_id * devices,
- cl_uint* num_devices);
-
- if( devices == NULL ) {
- return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
- }
-
- static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
- __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
-
- cl_uint n = 0;
- cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
- object_,
- d3d_device_source,
- d3d_object,
- d3d_device_set,
- 0,
- NULL,
- &n);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
- }
-
- cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
- err = pfn_clGetDeviceIDsFromD3D10KHR(
- object_,
- d3d_device_source,
- d3d_object,
- d3d_device_set,
- n,
- ids,
- NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
- }
-
- devices->assign(&ids[0], &ids[n]);
- return CL_SUCCESS;
- }
-#endif
-
- /*! \brief Gets a list of available platforms.
- *
- * Wraps clGetPlatformIDs().
- */
- static cl_int get(
- VECTOR_CLASS<Platform>* platforms)
- {
- cl_uint n = 0;
-
- if( platforms == NULL ) {
- return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
- }
-
- cl_int err = ::clGetPlatformIDs(0, NULL, &n);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
- }
-
- cl_platform_id* ids = (cl_platform_id*) alloca(
- n * sizeof(cl_platform_id));
- err = ::clGetPlatformIDs(n, ids, NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
- }
-
- platforms->assign(&ids[0], &ids[n]);
- return CL_SUCCESS;
- }
-
- /*! \brief Gets the first available platform.
- *
- * Wraps clGetPlatformIDs(), returning the first result.
- */
- static cl_int get(
- Platform * platform)
- {
- cl_uint n = 0;
-
- if( platform == NULL ) {
- return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
- }
-
- cl_int err = ::clGetPlatformIDs(0, NULL, &n);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
- }
-
- cl_platform_id* ids = (cl_platform_id*) alloca(
- n * sizeof(cl_platform_id));
- err = ::clGetPlatformIDs(n, ids, NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
- }
-
- *platform = ids[0];
- return CL_SUCCESS;
- }
-
- /*! \brief Gets the first available platform, returning it by value.
- *
- * Wraps clGetPlatformIDs(), returning the first result.
- */
- static Platform get(
- cl_int * errResult = NULL)
- {
- Platform platform;
- cl_uint n = 0;
- cl_int err = ::clGetPlatformIDs(0, NULL, &n);
- if (err != CL_SUCCESS) {
- detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
- if (errResult != NULL) {
- *errResult = err;
- }
- // Return the default-constructed (NULL) platform rather than
- // reading an ID list that was never filled in.
- return platform;
- }
-
- cl_platform_id* ids = (cl_platform_id*) alloca(
- n * sizeof(cl_platform_id));
- err = ::clGetPlatformIDs(n, ids, NULL);
- if (err != CL_SUCCESS) {
- detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
- if (errResult != NULL) {
- *errResult = err;
- }
- return platform;
- }
-
- if (errResult != NULL) {
- *errResult = CL_SUCCESS;
- }
- return ids[0];
- }
-
- static Platform getDefault(
- cl_int *errResult = NULL )
- {
- return get(errResult);
- }
-
-
-#if defined(CL_VERSION_1_2)
- //! \brief Wrapper for clUnloadPlatformCompiler().
- cl_int
- unloadCompiler()
- {
- return ::clUnloadPlatformCompiler(object_);
- }
-#endif // #if defined(CL_VERSION_1_2)
-}; // class Platform
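-
-/* Usage sketch (illustrative addition, not part of the original header):
- * enumerate platforms and pick the first one exposing a GPU device,
- * assuming at least one OpenCL platform is installed.
- *
- * VECTOR_CLASS<cl::Platform> platforms;
- * cl::Platform::get(&platforms);
- * VECTOR_CLASS<cl::Device> devices;
- * for (::size_t i = 0; i < platforms.size(); ++i) {
- * if (platforms[i].getDevices(CL_DEVICE_TYPE_GPU, &devices) == CL_SUCCESS
- * && devices.size() > 0)
- * break;
- * }
- */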
-
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
-/**
- * Unload the OpenCL compiler.
- * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
- */
-inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
-UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-inline cl_int
-UnloadCompiler()
-{
- return ::clUnloadCompiler();
-}
-#endif // deprecated OpenCL 1.1 APIs
-
-/*! \brief Class interface for cl_context.
- *
- * \note Copies of these objects are shallow, meaning that the copy will refer
- * to the same underlying cl_context as the original. For details, see
- * clRetainContext() and clReleaseContext().
- *
- * \see cl_context
- */
-class Context
- : public detail::Wrapper<cl_context>
-{
-private:
- static volatile int default_initialized_;
- static Context default_;
- static volatile cl_int default_error_;
-public:
- /*! \brief Destructor.
- *
- * This calls clReleaseContext() on the value held by this instance.
- */
- ~Context() { }
-
- /*! \brief Constructs a context including a list of specified devices.
- *
- * Wraps clCreateContext().
- */
- Context(
- const VECTOR_CLASS<Device>& devices,
- cl_context_properties* properties = NULL,
- void (CL_CALLBACK * notifyFptr)(
- const char *,
- const void *,
- ::size_t,
- void *) = NULL,
- void* data = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
-
- ::size_t numDevices = devices.size();
- cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
- for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
- deviceIDs[deviceIndex] = (devices[deviceIndex])();
- }
-
- object_ = ::clCreateContext(
- properties, (cl_uint) numDevices,
- deviceIDs,
- notifyFptr, data, &error);
-
- detail::errHandler(error, __CREATE_CONTEXT_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- Context(
- const Device& device,
- cl_context_properties* properties = NULL,
- void (CL_CALLBACK * notifyFptr)(
- const char *,
- const void *,
- ::size_t,
- void *) = NULL,
- void* data = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
-
- cl_device_id deviceID = device();
-
- object_ = ::clCreateContext(
- properties, 1,
- &deviceID,
- notifyFptr, data, &error);
-
- detail::errHandler(error, __CREATE_CONTEXT_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- /*! \brief Constructs a context including all or a subset of devices of a specified type.
- *
- * Wraps clCreateContextFromType().
- */
- Context(
- cl_device_type type,
- cl_context_properties* properties = NULL,
- void (CL_CALLBACK * notifyFptr)(
- const char *,
- const void *,
- ::size_t,
- void *) = NULL,
- void* data = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
-
-#if !defined(__APPLE__) || !defined(__MACOS)
- cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
-
- if (properties == NULL) {
- // Get a valid platform ID as we cannot send in a blank one
- VECTOR_CLASS<Platform> platforms;
- error = Platform::get(&platforms);
- if (error != CL_SUCCESS) {
- detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
- if (err != NULL) {
- *err = error;
- }
- return;
- }
-
- // Check the platforms we found for a device of our specified type
- cl_context_properties platform_id = 0;
- for (unsigned int i = 0; i < platforms.size(); i++) {
-
- VECTOR_CLASS<Device> devices;
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
- try {
-#endif
-
- error = platforms[i].getDevices(type, &devices);
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
- } catch (const Error&) {}
- // Swallow the exception here if exceptions are enabled: we don't want to
- // bail out just because the first platform has no devices of the requested
- // type. Error checking follows below and can throw there if needed.
-#endif
-
- // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
- if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
- detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- if (devices.size() > 0) {
- platform_id = (cl_context_properties)platforms[i]();
- break;
- }
- }
-
- if (platform_id == 0) {
- detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
- if (err != NULL) {
- *err = CL_DEVICE_NOT_FOUND;
- }
- return;
- }
-
- prop[1] = platform_id;
- properties = &prop[0];
- }
-#endif
- object_ = ::clCreateContextFromType(
- properties, type, notifyFptr, data, &error);
-
- detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
- *
- * \note All calls to this function return the same cl_context as the first.
- */
- static Context getDefault(cl_int * err = NULL)
- {
- int state = detail::compare_exchange(
- &default_initialized_,
- __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
-
- if (state & __DEFAULT_INITIALIZED) {
- if (err != NULL) {
- *err = default_error_;
- }
- return default_;
- }
-
- if (state & __DEFAULT_BEING_INITIALIZED) {
- // Assume writes will propagate eventually...
- while(default_initialized_ != __DEFAULT_INITIALIZED) {
- detail::fence();
- }
-
- if (err != NULL) {
- *err = default_error_;
- }
- return default_;
- }
-
- cl_int error;
- default_ = Context(
- CL_DEVICE_TYPE_DEFAULT,
- NULL,
- NULL,
- NULL,
- &error);
-
- detail::fence();
-
- default_error_ = error;
- // Assume writes will propagate eventually...
- default_initialized_ = __DEFAULT_INITIALIZED;
-
- detail::fence();
-
- if (err != NULL) {
- *err = default_error_;
- }
- return default_;
-
- }
-
- //! \brief Default constructor - initializes to NULL.
- Context() : detail::Wrapper<cl_type>() { }
-
- /*! \brief Copy constructor.
- *
- * This calls clRetainContext() on the parameter's cl_context.
- */
- Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
-
- /*! \brief Constructor from cl_context - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the cl_context
- * into the new Context object.
- */
- __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
-
- /*! \brief Assignment operator from Context.
- *
- * This calls clRetainContext() on the parameter and clReleaseContext() on
- * the previous value held by this instance.
- */
- Context& operator = (const Context& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_context - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the rhs and calls
- * clReleaseContext() on the value previously held by this instance.
- */
- Context& operator = (const cl_context& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetContextInfo().
- template <typename T>
- cl_int getInfo(cl_context_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetContextInfo, object_, name, param),
- __GET_CONTEXT_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetContextInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_context_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_context_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- /*! \brief Gets a list of supported image formats.
- *
- * Wraps clGetSupportedImageFormats().
- */
- cl_int getSupportedImageFormats(
- cl_mem_flags flags,
- cl_mem_object_type type,
- VECTOR_CLASS<ImageFormat>* formats) const
- {
- cl_uint numEntries;
- cl_int err = ::clGetSupportedImageFormats(
- object_,
- flags,
- type,
- 0,
- NULL,
- &numEntries);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
- }
-
- ImageFormat* value = (ImageFormat*)
- alloca(numEntries * sizeof(ImageFormat));
- err = ::clGetSupportedImageFormats(
- object_,
- flags,
- type,
- numEntries,
- (cl_image_format*) value,
- NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
- }
-
- formats->assign(&value[0], &value[numEntries]);
- return CL_SUCCESS;
- }
-};
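-
-/* Usage sketch (illustrative addition, not part of the original header):
- * create a context for a single device and query its device list back,
- * assuming `device` is a valid cl::Device.
- *
- * cl_int err;
- * cl::Context context(device, NULL, NULL, NULL, &err);
- * VECTOR_CLASS<cl::Device> ctxDevices =
- * context.getInfo<CL_CONTEXT_DEVICES>(&err);
- */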
-
-inline Device Device::getDefault(cl_int * err)
-{
- cl_int error;
- Device device;
-
- Context context = Context::getDefault(&error);
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
- if (error != CL_SUCCESS) {
- if (err != NULL) {
- *err = error;
- }
- }
- else {
- device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
- if (err != NULL) {
- *err = CL_SUCCESS;
- }
- }
-
- return device;
-}
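-
-/* Usage sketch (illustrative addition, not part of the original header):
- * fetch the default device without enumerating platforms explicitly.
- *
- * cl_int err;
- * cl::Device device = cl::Device::getDefault(&err);
- */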
-
-
-#ifdef _WIN32
-__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__declspec(selectany) Context Context::default_;
-__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
-#else
-__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__attribute__((weak)) Context Context::default_;
-__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
-#endif
-
-/*! \brief Class interface for cl_event.
- *
- * \note Copies of these objects are shallow, meaning that the copy will refer
- * to the same underlying cl_event as the original. For details, see
- * clRetainEvent() and clReleaseEvent().
- *
- * \see cl_event
- */
-class Event : public detail::Wrapper<cl_event>
-{
-public:
- /*! \brief Destructor.
- *
- * This calls clReleaseEvent() on the value held by this instance.
- */
- ~Event() { }
-
- //! \brief Default constructor - initializes to NULL.
- Event() : detail::Wrapper<cl_type>() { }
-
- /*! \brief Copy constructor.
- *
- * This calls clRetainEvent() on the parameter's cl_event.
- */
- Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
-
- /*! \brief Constructor from cl_event - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the cl_event
- * into the new Event object.
- */
- Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
-
- /*! \brief Assignment operator from Event.
- *
- * This calls clRetainEvent() on the parameter and clReleaseEvent() on
- * the previous value held by this instance.
- */
- Event& operator = (const Event& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_event - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the rhs and calls
- * clReleaseEvent() on the value previously held by this instance.
- */
- Event& operator = (const cl_event& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetEventInfo().
- template <typename T>
- cl_int getInfo(cl_event_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetEventInfo, object_, name, param),
- __GET_EVENT_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetEventInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_event_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_event_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- //! \brief Wrapper for clGetEventProfilingInfo().
- template <typename T>
- cl_int getProfilingInfo(cl_profiling_info name, T* param) const
- {
- return detail::errHandler(detail::getInfo(
- &::clGetEventProfilingInfo, object_, name, param),
- __GET_EVENT_PROFILE_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_profiling_info, name>::param_type
- getProfilingInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_profiling_info, name>::param_type param;
- cl_int result = getProfilingInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- /*! \brief Blocks the calling thread until this event completes.
- *
- * Wraps clWaitForEvents().
- */
- cl_int wait() const
- {
- return detail::errHandler(
- ::clWaitForEvents(1, &object_),
- __WAIT_FOR_EVENTS_ERR);
- }
-
-#if defined(CL_VERSION_1_1)
- /*! \brief Registers a user callback function for a specific command execution status.
- *
- * Wraps clSetEventCallback().
- */
- cl_int setCallback(
- cl_int type,
- void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
- void * user_data = NULL)
- {
- return detail::errHandler(
- ::clSetEventCallback(
- object_,
- type,
- pfn_notify,
- user_data),
- __SET_EVENT_CALLBACK_ERR);
- }
-#endif
-
- /*! \brief Blocks the calling thread until every event specified is complete.
- *
- * Wraps clWaitForEvents().
- */
- static cl_int
- waitForEvents(const VECTOR_CLASS<Event>& events)
- {
- return detail::errHandler(
- ::clWaitForEvents(
- (cl_uint) events.size(),
- (events.size() > 0) ? (cl_event*)&events.front() : NULL),
- __WAIT_FOR_EVENTS_ERR);
- }
-};
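-
-/* Usage sketch (illustrative addition, not part of the original header):
- * block on an event and read its execution status by value, assuming
- * `event` was returned by an earlier enqueue call.
- *
- * event.wait();
- * cl_int status = event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
- */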
-
-#if defined(CL_VERSION_1_1)
-/*! \brief Class interface for user events (a subset of cl_event objects).
- *
- * See Event for details about copy semantics, etc.
- */
-class UserEvent : public Event
-{
-public:
- /*! \brief Constructs a user event on a given context.
- *
- * Wraps clCreateUserEvent().
- */
- UserEvent(
- const Context& context,
- cl_int * err = NULL)
- {
- cl_int error;
- object_ = ::clCreateUserEvent(
- context(),
- &error);
-
- detail::errHandler(error, __CREATE_USER_EVENT_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- //! \brief Default constructor - initializes to NULL.
- UserEvent() : Event() { }
-
- //! \brief Copy constructor - performs shallow copy.
- UserEvent(const UserEvent& event) : Event(event) { }
-
- //! \brief Assignment Operator - performs shallow copy.
- UserEvent& operator = (const UserEvent& rhs)
- {
- if (this != &rhs) {
- Event::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Sets the execution status of a user event object.
- *
- * Wraps clSetUserEventStatus().
- */
- cl_int setStatus(cl_int status)
- {
- return detail::errHandler(
- ::clSetUserEventStatus(object_,status),
- __SET_USER_EVENT_STATUS_ERR);
- }
-};
-#endif
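-
-/* Usage sketch (illustrative addition, not part of the original header):
- * gate enqueued work on a host-controlled user event (OpenCL 1.1+).
- *
- * cl::UserEvent gate(context);
- * VECTOR_CLASS<cl::Event> waitList;
- * waitList.push_back(gate);
- * // ... enqueue commands that pass `waitList` as their wait list ...
- * gate.setStatus(CL_COMPLETE); // releases the gated commands
- */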
-
-/*! \brief Blocks the calling thread until every event specified is complete.
- *
- * Wraps clWaitForEvents().
- */
-inline static cl_int
-WaitForEvents(const VECTOR_CLASS<Event>& events)
-{
- return detail::errHandler(
- ::clWaitForEvents(
- (cl_uint) events.size(),
- (events.size() > 0) ? (cl_event*)&events.front() : NULL),
- __WAIT_FOR_EVENTS_ERR);
-}
-
-/*! \brief Class interface for cl_mem.
- *
- * \note Copies of these objects are shallow, meaning that the copy will refer
- * to the same underlying cl_mem as the original. For details, see
- * clRetainMemObject() and clReleaseMemObject().
- *
- * \see cl_mem
- */
-class Memory : public detail::Wrapper<cl_mem>
-{
-public:
-
- /*! \brief Destructor.
- *
- * This calls clReleaseMemObject() on the value held by this instance.
- */
- ~Memory() {}
-
- //! \brief Default constructor - initializes to NULL.
- Memory() : detail::Wrapper<cl_type>() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * This calls clRetainMemObject() on the parameter's cl_mem.
- */
- Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the cl_mem
- * into the new Memory object.
- */
- __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
-
- /*! \brief Assignment operator from Memory.
- *
- * This calls clRetainMemObject() on the parameter and clReleaseMemObject()
- * on the previous value held by this instance.
- */
- Memory& operator = (const Memory& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_mem - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the rhs and calls
- * clReleaseMemObject() on the value previously held by this instance.
- */
- Memory& operator = (const cl_mem& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetMemObjectInfo().
- template <typename T>
- cl_int getInfo(cl_mem_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
- __GET_MEM_OBJECT_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_mem_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_mem_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
-#if defined(CL_VERSION_1_1)
- /*! \brief Registers a callback function to be called when the memory object
- * is no longer needed.
- *
- * Wraps clSetMemObjectDestructorCallback().
- *
- * Repeated calls to this function, for a given cl_mem value, will append
- * to the list of functions called (in reverse order) when memory object's
- * resources are freed and the memory object is deleted.
- *
- * \note
- * The registered callbacks are associated with the underlying cl_mem
- * value - not the Memory class instance.
- */
- cl_int setDestructorCallback(
- void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
- void * user_data = NULL)
- {
- return detail::errHandler(
- ::clSetMemObjectDestructorCallback(
- object_,
- pfn_notify,
- user_data),
- __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
- }
-#endif
-
-};
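-
-/* Usage sketch (illustrative addition, not part of the original header):
- * register a destructor callback on a memory object (OpenCL 1.1+);
- * `onFree` is a hypothetical user-supplied function.
- *
- * void CL_CALLBACK onFree(cl_mem memobj, void* user_data);
- *
- * memory.setDestructorCallback(&onFree, NULL);
- */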
-
-// Pre-declare copy functions
-class Buffer;
-class CommandQueue; // used by the queue-based copy overloads below
-template< typename IteratorType >
-cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
-template< typename IteratorType >
-cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
-template< typename IteratorType >
-cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
-template< typename IteratorType >
-cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
-
-
-/*! \brief Class interface for Buffer Memory Objects.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class Buffer : public Memory
-{
-public:
-
- /*! \brief Constructs a Buffer in a specified context.
- *
- * Wraps clCreateBuffer().
- *
- * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
- * specified. Note the alignment and exclusive-use requirements that
- * CL_MEM_USE_HOST_PTR places on this storage.
- */
- Buffer(
- const Context& context,
- cl_mem_flags flags,
- ::size_t size,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
- object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- /*! \brief Constructs a Buffer in the default context.
- *
- * Wraps clCreateBuffer().
- *
- * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
- * specified. Note the alignment and exclusive-use requirements that
- * CL_MEM_USE_HOST_PTR places on this storage.
- *
- * \see Context::getDefault()
- */
- Buffer(
- cl_mem_flags flags,
- ::size_t size,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
-
- Context context = Context::getDefault(err);
-
- object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- /*!
- * \brief Construct a Buffer from a host container via iterators.
- * IteratorType must be random access.
- * If useHostPtr is specified, the iterators must refer to contiguous data.
- */
- template< typename IteratorType >
- Buffer(
- IteratorType startIterator,
- IteratorType endIterator,
- bool readOnly,
- bool useHostPtr = false,
- cl_int* err = NULL)
- {
- typedef typename std::iterator_traits<IteratorType>::value_type DataType;
- cl_int error;
-
- cl_mem_flags flags = 0;
- if( readOnly ) {
- flags |= CL_MEM_READ_ONLY;
- }
- else {
- flags |= CL_MEM_READ_WRITE;
- }
- if( useHostPtr ) {
- flags |= CL_MEM_USE_HOST_PTR;
- }
-
- ::size_t size = sizeof(DataType)*(endIterator - startIterator);
-
- Context context = Context::getDefault(err);
-
- if( useHostPtr ) {
- object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
- } else {
- object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
- }
-
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
-
- if( !useHostPtr ) {
- error = cl::copy(startIterator, endIterator, *this);
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
- }
-
- /*!
- * \brief Construct a Buffer from a host container via iterators using a specified context.
- * IteratorType must be random access.
- * If useHostPtr is specified, the iterators must refer to contiguous data.
- */
- template< typename IteratorType >
- Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
- bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
-
- //! \brief Default constructor - initializes to NULL.
- Buffer() : Memory() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Buffer(const Buffer& buffer) : Memory(buffer) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
-
- /*! \brief Assignment from Buffer - performs shallow copy.
- *
- * See Memory for further details.
- */
- Buffer& operator = (const Buffer& rhs)
- {
- if (this != &rhs) {
- Memory::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Buffer& operator = (const cl_mem& rhs)
- {
- Memory::operator=(rhs);
- return *this;
- }
-
-#if defined(CL_VERSION_1_1)
- /*! \brief Creates a new buffer object from this.
- *
- * Wraps clCreateSubBuffer().
- */
- Buffer createSubBuffer(
- cl_mem_flags flags,
- cl_buffer_create_type buffer_create_type,
- const void * buffer_create_info,
- cl_int * err = NULL)
- {
- Buffer result;
- cl_int error;
- result.object_ = ::clCreateSubBuffer(
- object_,
- flags,
- buffer_create_type,
- buffer_create_info,
- &error);
-
- detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
-
- return result;
- }
-#endif
-};
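-
-/* Usage sketch (illustrative addition, not part of the original header):
- * build a read-only buffer from a host std::vector via the iterator
- * constructor; the data is copied through the default command queue.
- *
- * std::vector<float> host(1024, 0.0f);
- * cl_int err;
- * cl::Buffer buf(host.begin(), host.end(),
- * true, // readOnly
- * false, // useHostPtr
- * &err);
- */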
-
-#if defined (USE_DX_INTEROP)
-/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer objects.
- *
- * This is provided to facilitate interoperability with Direct3D.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class BufferD3D10 : public Buffer
-{
-public:
- typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
- cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer,
- cl_int* errcode_ret);
-
- /*! \brief Constructs a BufferD3D10, in a specified context, from a
- * given ID3D10Buffer.
- *
- * Wraps clCreateFromD3D10BufferKHR().
- */
- BufferD3D10(
- const Context& context,
- cl_mem_flags flags,
- ID3D10Buffer* bufobj,
- cl_int * err = NULL)
- {
- static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
-
-#if defined(CL_VERSION_1_2)
- // Look up the platform from the context properties: on 1.2 the
- // extension function pointer must be fetched per platform.
- VECTOR_CLASS<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
- cl_platform_id platform = NULL;
- for( ::size_t i = 0; i < props.size(); ++i ) {
- if( props[i] == CL_CONTEXT_PLATFORM ) {
- platform = (cl_platform_id) props[i+1];
- }
- }
- __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
-#elif defined(CL_VERSION_1_1)
- __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
-#endif
-
- cl_int error;
- object_ = pfn_clCreateFromD3D10BufferKHR(
- context(),
- flags,
- bufobj,
- &error);
-
- detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- //! \brief Default constructor - initializes to NULL.
- BufferD3D10() : Buffer() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
-
- /*! \brief Assignment from BufferD3D10 - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferD3D10& operator = (const BufferD3D10& rhs)
- {
- if (this != &rhs) {
- Buffer::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferD3D10& operator = (const cl_mem& rhs)
- {
- Buffer::operator=(rhs);
- return *this;
- }
-};
-#endif
-
-/*! \brief Class interface for GL Buffer Memory Objects.
- *
- * This is provided to facilitate interoperability with OpenGL.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class BufferGL : public Buffer
-{
-public:
- /*! \brief Constructs a BufferGL in a specified context, from a given
- * GL buffer.
- *
- * Wraps clCreateFromGLBuffer().
- */
- BufferGL(
- const Context& context,
- cl_mem_flags flags,
- GLuint bufobj,
- cl_int * err = NULL)
- {
- cl_int error;
- object_ = ::clCreateFromGLBuffer(
- context(),
- flags,
- bufobj,
- &error);
-
- detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- //! \brief Default constructor - initializes to NULL.
- BufferGL() : Buffer() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
-
- /*! \brief Assignment from BufferGL - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferGL& operator = (const BufferGL& rhs)
- {
- if (this != &rhs) {
- Buffer::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferGL& operator = (const cl_mem& rhs)
- {
- Buffer::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetGLObjectInfo().
- cl_int getObjectInfo(
- cl_gl_object_type *type,
- GLuint * gl_object_name)
- {
- return detail::errHandler(
- ::clGetGLObjectInfo(object_,type,gl_object_name),
- __GET_GL_OBJECT_INFO_ERR);
- }
-};
-
-/*! \brief Class interface for GL Render Buffer Memory Objects.
- *
- * This is provided to facilitate interoperability with OpenGL.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class BufferRenderGL : public Buffer
-{
-public:
- /*! \brief Constructs a BufferRenderGL in a specified context, from a given
- * GL Renderbuffer.
- *
- * Wraps clCreateFromGLRenderbuffer().
- */
- BufferRenderGL(
- const Context& context,
- cl_mem_flags flags,
- GLuint bufobj,
- cl_int * err = NULL)
- {
- cl_int error;
- object_ = ::clCreateFromGLRenderbuffer(
- context(),
- flags,
- bufobj,
- &error);
-
- detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- //! \brief Default constructor - initializes to NULL.
- BufferRenderGL() : Buffer() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
-
- /*! \brief Assignment from BufferRenderGL - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferRenderGL& operator = (const BufferRenderGL& rhs)
- {
- if (this != &rhs) {
- Buffer::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- BufferRenderGL& operator = (const cl_mem& rhs)
- {
- Buffer::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetGLObjectInfo().
- cl_int getObjectInfo(
- cl_gl_object_type *type,
- GLuint * gl_object_name)
- {
- return detail::errHandler(
- ::clGetGLObjectInfo(object_,type,gl_object_name),
- __GET_GL_OBJECT_INFO_ERR);
- }
-};
-
-/*! \brief C++ base class for Image Memory objects.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class Image : public Memory
-{
-protected:
- //! \brief Default constructor - initializes to NULL.
- Image() : Memory() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image(const Image& image) : Memory(image) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
-
- /*! \brief Assignment from Image - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image& operator = (const Image& rhs)
- {
- if (this != &rhs) {
- Memory::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image& operator = (const cl_mem& rhs)
- {
- Memory::operator=(rhs);
- return *this;
- }
-
-public:
- //! \brief Wrapper for clGetImageInfo().
- template <typename T>
- cl_int getImageInfo(cl_image_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetImageInfo, object_, name, param),
- __GET_IMAGE_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetImageInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_image_info, name>::param_type
- getImageInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_image_info, name>::param_type param;
- cl_int result = getImageInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-};
-
-#if defined(CL_VERSION_1_2)
-/*! \brief Class interface for 1D Image Memory objects.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class Image1D : public Image
-{
-public:
- /*! \brief Constructs a 1D Image in a specified context.
- *
- * Wraps clCreateImage().
- */
- Image1D(
- const Context& context,
- cl_mem_flags flags,
- ImageFormat format,
- ::size_t width,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE1D,
- width,
- 0, 0, 0, 0, 0, 0, 0, 0
- };
- object_ = ::clCreateImage(
- context(),
- flags,
- &format,
- &desc,
- host_ptr,
- &error);
-
- detail::errHandler(error, __CREATE_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- //! \brief Default constructor - initializes to NULL.
- Image1D() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image1D(const Image1D& image1D) : Image(image1D) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
-
- /*! \brief Assignment from Image1D - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image1D& operator = (const Image1D& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image1D& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
-
-/*! \class Image1DBuffer
- * \brief Image interface for 1D buffer images.
- */
-class Image1DBuffer : public Image
-{
-public:
- Image1DBuffer(
- const Context& context,
- cl_mem_flags flags,
- ImageFormat format,
- ::size_t width,
- const Buffer &buffer,
- cl_int* err = NULL)
- {
- cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE1D_BUFFER,
- width,
- 0, 0, 0, 0, 0, 0, 0,
- buffer()
- };
- object_ = ::clCreateImage(
- context(),
- flags,
- &format,
- &desc,
- NULL,
- &error);
-
- detail::errHandler(error, __CREATE_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- Image1DBuffer() { }
-
- Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
-
- __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
-
- Image1DBuffer& operator = (const Image1DBuffer& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- Image1DBuffer& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
-
-/*! \class Image1DArray
- * \brief Image interface for arrays of 1D images.
- */
-class Image1DArray : public Image
-{
-public:
- Image1DArray(
- const Context& context,
- cl_mem_flags flags,
- ImageFormat format,
- ::size_t arraySize,
- ::size_t width,
- ::size_t rowPitch,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE1D_ARRAY,
- width,
- 0, 0, // height, depth (unused)
- arraySize,
- rowPitch,
- 0, 0, 0, 0
- };
- object_ = ::clCreateImage(
- context(),
- flags,
- &format,
- &desc,
- host_ptr,
- &error);
-
- detail::errHandler(error, __CREATE_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- Image1DArray() { }
-
- Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
-
- __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
-
- Image1DArray& operator = (const Image1DArray& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- Image1DArray& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
-#endif // #if defined(CL_VERSION_1_2)
-
-
-/*! \brief Class interface for 2D Image Memory objects.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class Image2D : public Image
-{
-public:
- /*! \brief Constructs a 2D Image in a specified context.
- *
- * Wraps clCreateImage().
- */
- Image2D(
- const Context& context,
- cl_mem_flags flags,
- ImageFormat format,
- ::size_t width,
- ::size_t height,
- ::size_t row_pitch = 0,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
- bool useCreateImage;
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- // Run-time decision based on the actual platform
- {
- cl_uint version = detail::getContextPlatformVersion(context());
- useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
- }
-#elif defined(CL_VERSION_1_2)
- useCreateImage = true;
-#else
- useCreateImage = false;
-#endif
-
-#if defined(CL_VERSION_1_2)
- if (useCreateImage)
- {
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE2D,
- width,
- height,
- 0, 0, // depth, array size (unused)
- row_pitch,
- 0, 0, 0, 0
- };
- object_ = ::clCreateImage(
- context(),
- flags,
- &format,
- &desc,
- host_ptr,
- &error);
-
- detail::errHandler(error, __CREATE_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-#endif // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- if (!useCreateImage)
- {
- object_ = ::clCreateImage2D(
- context(), flags,&format, width, height, row_pitch, host_ptr, &error);
-
- detail::errHandler(error, __CREATE_IMAGE2D_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- }
-
- //! \brief Default constructor - initializes to NULL.
- Image2D() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image2D(const Image2D& image2D) : Image(image2D) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
-
- /*! \brief Assignment from Image2D - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image2D& operator = (const Image2D& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image2D& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
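-
-/* Usage sketch (illustrative addition, not part of the original header):
- * create a 512x512 RGBA8 image in an existing context.
- *
- * cl_int err;
- * cl::ImageFormat format(CL_RGBA, CL_UNORM_INT8);
- * cl::Image2D image(context, CL_MEM_READ_WRITE, format,
- * 512, 512, 0, NULL, &err);
- */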
-
-
-#if !defined(CL_VERSION_1_2)
-/*! \brief Class interface for GL 2D Image Memory objects.
- *
- * This is provided to facilitate interoperability with OpenGL.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- * \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
- */
-class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
-{
-public:
- /*! \brief Constructs an Image2DGL in a specified context, from a given
- * GL Texture.
- *
- * Wraps clCreateFromGLTexture2D().
- */
- Image2DGL(
- const Context& context,
- cl_mem_flags flags,
- GLenum target,
- GLint miplevel,
- GLuint texobj,
- cl_int * err = NULL)
- {
- cl_int error;
- object_ = ::clCreateFromGLTexture2D(
- context(),
- flags,
- target,
- miplevel,
- texobj,
- &error);
-
- detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
- if (err != NULL) {
- *err = error;
- }
-
- }
-
- //! \brief Default constructor - initializes to NULL.
- Image2DGL() : Image2D() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image2DGL(const Image2DGL& image) : Image2D(image) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
-
- /*! \brief Assignment from Image2DGL - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image2DGL& operator = (const Image2DGL& rhs)
- {
- if (this != &rhs) {
- Image2D::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image2DGL& operator = (const cl_mem& rhs)
- {
- Image2D::operator=(rhs);
- return *this;
- }
-};
-#endif // #if !defined(CL_VERSION_1_2)
-
-#if defined(CL_VERSION_1_2)
-/*! \class Image2DArray
- * \brief Image interface for arrays of 2D images.
- */
-class Image2DArray : public Image
-{
-public:
- Image2DArray(
- const Context& context,
- cl_mem_flags flags,
- ImageFormat format,
- ::size_t arraySize,
- ::size_t width,
- ::size_t height,
- ::size_t rowPitch,
- ::size_t slicePitch,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE2D_ARRAY,
- width,
- height,
- 0, // depth (unused)
- arraySize,
- rowPitch,
- slicePitch,
- 0, 0, 0
- };
- object_ = ::clCreateImage(
- context(),
- flags,
- &format,
- &desc,
- host_ptr,
- &error);
-
- detail::errHandler(error, __CREATE_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- Image2DArray() { }
-
- Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
-
- __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
-
- Image2DArray& operator = (const Image2DArray& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- Image2DArray& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
-#endif // #if defined(CL_VERSION_1_2)
-
-/*! \brief Class interface for 3D Image Memory objects.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class Image3D : public Image
-{
-public:
- /*! \brief Constructs a 3D Image in a specified context.
- *
- * Wraps clCreateImage().
- */
- Image3D(
- const Context& context,
- cl_mem_flags flags,
- ImageFormat format,
- ::size_t width,
- ::size_t height,
- ::size_t depth,
- ::size_t row_pitch = 0,
- ::size_t slice_pitch = 0,
- void* host_ptr = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
- bool useCreateImage;
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- // Run-time decision based on the actual platform
- {
- cl_uint version = detail::getContextPlatformVersion(context());
- useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
- }
-#elif defined(CL_VERSION_1_2)
- useCreateImage = true;
-#else
- useCreateImage = false;
-#endif
-
-#if defined(CL_VERSION_1_2)
- if (useCreateImage)
- {
- cl_image_desc desc =
- {
- CL_MEM_OBJECT_IMAGE3D,
- width,
- height,
- depth,
- 0, // array size (unused)
- row_pitch,
- slice_pitch,
- 0, 0, 0
- };
- object_ = ::clCreateImage(
- context(),
- flags,
- &format,
- &desc,
- host_ptr,
- &error);
-
- detail::errHandler(error, __CREATE_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-#endif // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- if (!useCreateImage)
- {
- object_ = ::clCreateImage3D(
- context(), flags, &format, width, height, depth, row_pitch,
- slice_pitch, host_ptr, &error);
-
- detail::errHandler(error, __CREATE_IMAGE3D_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
- }
-
- //! \brief Default constructor - initializes to NULL.
- Image3D() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image3D(const Image3D& image3D) : Image(image3D) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
-
- /*! \brief Assignment from Image3D - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image3D& operator = (const Image3D& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image3D& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
-
-#if !defined(CL_VERSION_1_2)
-/*! \brief Class interface for GL 3D Image Memory objects.
- *
- * This is provided to facilitate interoperability with OpenGL.
- *
- * See Memory for details about copy semantics, etc.
- *
- * \see Memory
- */
-class Image3DGL : public Image3D
-{
-public:
- /*! \brief Constructs an Image3DGL in a specified context, from a given
- * GL Texture.
- *
- * Wraps clCreateFromGLTexture3D().
- */
- Image3DGL(
- const Context& context,
- cl_mem_flags flags,
- GLenum target,
- GLint miplevel,
- GLuint texobj,
- cl_int * err = NULL)
- {
- cl_int error;
- object_ = ::clCreateFromGLTexture3D(
- context(),
- flags,
- target,
- miplevel,
- texobj,
- &error);
-
- detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- //! \brief Default constructor - initializes to NULL.
- Image3DGL() : Image3D() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image3DGL(const Image3DGL& image) : Image3D(image) { }
-
- /*! \brief Constructor from cl_mem - takes ownership.
- *
- * See Memory for further details.
- */
- __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
-
- /*! \brief Assignment from Image3DGL - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image3DGL& operator = (const Image3DGL& rhs)
- {
- if (this != &rhs) {
- Image3D::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment from cl_mem - performs shallow copy.
- *
- * See Memory for further details.
- */
- Image3DGL& operator = (const cl_mem& rhs)
- {
- Image3D::operator=(rhs);
- return *this;
- }
-};
-#endif // #if !defined(CL_VERSION_1_2)
-
-#if defined(CL_VERSION_1_2)
-/*! \class ImageGL
- * \brief General image interface for GL interop.
- * The 2D and 3D GL image types are abstracted into this single wrapper
- * for all GL-sourced images, since the dimensionality and other setup
- * information were already fixed when the GL texture was created.
- */
-class ImageGL : public Image
-{
-public:
- ImageGL(
- const Context& context,
- cl_mem_flags flags,
- GLenum target,
- GLint miplevel,
- GLuint texobj,
- cl_int * err = NULL)
- {
- cl_int error;
- object_ = ::clCreateFromGLTexture(
- context(),
- flags,
- target,
- miplevel,
- texobj,
- &error);
-
- detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- ImageGL() : Image() { }
-
- ImageGL(const ImageGL& image) : Image(image) { }
-
- __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
-
- ImageGL& operator = (const ImageGL& rhs)
- {
- if (this != &rhs) {
- Image::operator=(rhs);
- }
- return *this;
- }
-
- ImageGL& operator = (const cl_mem& rhs)
- {
- Image::operator=(rhs);
- return *this;
- }
-};
-#endif // #if defined(CL_VERSION_1_2)
-
-/*! \brief Class interface for cl_sampler.
- *
- * \note Copies of these objects are shallow, meaning that the copy will refer
- * to the same underlying cl_sampler as the original. For details, see
- * clRetainSampler() and clReleaseSampler().
- *
- * \see cl_sampler
- */
-class Sampler : public detail::Wrapper<cl_sampler>
-{
-public:
- /*! \brief Destructor.
- *
- * This calls clReleaseSampler() on the value held by this instance.
- */
- ~Sampler() { }
-
- //! \brief Default constructor - initializes to NULL.
- Sampler() { }
-
- /*! \brief Constructs a Sampler in a specified context.
- *
- * Wraps clCreateSampler().
- */
- Sampler(
- const Context& context,
- cl_bool normalized_coords,
- cl_addressing_mode addressing_mode,
- cl_filter_mode filter_mode,
- cl_int* err = NULL)
- {
- cl_int error;
- object_ = ::clCreateSampler(
- context(),
- normalized_coords,
- addressing_mode,
- filter_mode,
- &error);
-
- detail::errHandler(error, __CREATE_SAMPLER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * This calls clRetainSampler() on the parameter's cl_sampler.
- */
- Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
-
- /*! \brief Constructor from cl_sampler - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the cl_sampler
- * into the new Sampler object.
- */
- Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
-
- /*! \brief Assignment operator from Sampler.
- *
- * This calls clRetainSampler() on the parameter and clReleaseSampler()
- * on the previous value held by this instance.
- */
- Sampler& operator = (const Sampler& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_sampler - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the rhs and calls
- * clReleaseSampler() on the value previously held by this instance.
- */
- Sampler& operator = (const cl_sampler& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- //! \brief Wrapper for clGetSamplerInfo().
- template <typename T>
- cl_int getInfo(cl_sampler_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetSamplerInfo, object_, name, param),
- __GET_SAMPLER_INFO_ERR);
- }
-
- //! \brief Wrapper for clGetSamplerInfo() that returns by value.
- template <cl_int name> typename
- detail::param_traits<detail::cl_sampler_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_sampler_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-};
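-
-/* Usage sketch (illustrative addition, not part of the original header):
- * create a sampler with normalized coordinates, edge clamping and linear
- * filtering.
- *
- * cl_int err;
- * cl::Sampler sampler(context, CL_TRUE, CL_ADDRESS_CLAMP_TO_EDGE,
- * CL_FILTER_LINEAR, &err);
- */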
-
-class Program;
-class CommandQueue;
-class Kernel;
-
-//! \brief Class interface for specifying NDRange values.
-class NDRange
-{
-private:
- size_t<3> sizes_;
- cl_uint dimensions_;
-
-public:
- //! \brief Default constructor - resulting range has zero dimensions.
- NDRange()
- : dimensions_(0)
- { }
-
- //! \brief Constructs one-dimensional range.
- NDRange(::size_t size0)
- : dimensions_(1)
- {
- sizes_[0] = size0;
- }
-
- //! \brief Constructs two-dimensional range.
- NDRange(::size_t size0, ::size_t size1)
- : dimensions_(2)
- {
- sizes_[0] = size0;
- sizes_[1] = size1;
- }
-
- //! \brief Constructs three-dimensional range.
- NDRange(::size_t size0, ::size_t size1, ::size_t size2)
- : dimensions_(3)
- {
- sizes_[0] = size0;
- sizes_[1] = size1;
- sizes_[2] = size2;
- }
-
- /*! \brief Conversion operator to const ::size_t *.
- *
- * \returns a pointer to the size of the first dimension.
- */
- operator const ::size_t*() const {
- return (const ::size_t*) sizes_;
- }
-
- //! \brief Queries the number of dimensions in the range.
- ::size_t dimensions() const { return dimensions_; }
-};
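-
-/* Usage sketch (illustrative addition, not part of the original header):
- * a 2D global range of 1024x768 split into 16x16 work-groups, as passed
- * to CommandQueue::enqueueNDRangeKernel().
- *
- * cl::NDRange global(1024, 768);
- * cl::NDRange local(16, 16);
- */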
-
-//! \brief A zero-dimensional range.
-static const NDRange NullRange;
-
-//! \brief Local address wrapper for use with Kernel::setArg
-struct LocalSpaceArg
-{
- ::size_t size_;
-};
-
-namespace detail {
-
-template <typename T>
-struct KernelArgumentHandler
-{
- static ::size_t size(const T&) { return sizeof(T); }
- static T* ptr(T& value) { return &value; }
-};
-
-template <>
-struct KernelArgumentHandler<LocalSpaceArg>
-{
- static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
- static void* ptr(LocalSpaceArg&) { return NULL; }
-};
-
-}
-//! \endcond
-
-/*! __local
- * \brief Helper function for generating LocalSpaceArg objects.
- * Deprecated. Replaced with Local.
- */
-inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
-__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-inline LocalSpaceArg
-__local(::size_t size)
-{
- LocalSpaceArg ret = { size };
- return ret;
-}
-
-/*! Local
- * \brief Helper function for generating LocalSpaceArg objects.
- */
-inline LocalSpaceArg
-Local(::size_t size)
-{
- LocalSpaceArg ret = { size };
- return ret;
-}
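-
-/* Usage sketch (illustrative addition, not part of the original header):
- * reserve 256 floats of __local memory for kernel argument 2, assuming
- * `kernel` is a valid cl::Kernel.
- *
- * kernel.setArg(2, cl::Local(256 * sizeof(float)));
- */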
-
-//class KernelFunctor;
-
-/*! \brief Class interface for cl_kernel.
- *
- * \note Copies of these objects are shallow, meaning that the copy will refer
- * to the same underlying cl_kernel as the original. For details, see
- * clRetainKernel() and clReleaseKernel().
- *
- * \see cl_kernel
- */
-class Kernel : public detail::Wrapper<cl_kernel>
-{
-public:
- inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
-
- /*! \brief Destructor.
- *
- * This calls clReleaseKernel() on the value held by this instance.
- */
- ~Kernel() { }
-
- //! \brief Default constructor - initializes to NULL.
- Kernel() { }
-
- /*! \brief Copy constructor - performs shallow copy.
- *
- * This calls clRetainKernel() on the parameter's cl_kernel.
- */
- Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
-
- /*! \brief Constructor from cl_kernel - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the cl_kernel
- * into the new Kernel object.
- */
- __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
-
- /*! \brief Assignment operator from Kernel.
- *
- * This calls clRetainKernel() on the parameter and clReleaseKernel()
- * on the previous value held by this instance.
- */
- Kernel& operator = (const Kernel& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- /*! \brief Assignment operator from cl_kernel - takes ownership.
- *
- * This effectively transfers ownership of a refcount on the rhs and calls
- * clReleaseKernel() on the value previously held by this instance.
- */
- Kernel& operator = (const cl_kernel& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- template <typename T>
- cl_int getInfo(cl_kernel_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetKernelInfo, object_, name, param),
- __GET_KERNEL_INFO_ERR);
- }
-
- template <cl_int name> typename
- detail::param_traits<detail::cl_kernel_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_kernel_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
-#if defined(CL_VERSION_1_2)
- template <typename T>
- cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
- __GET_KERNEL_ARG_INFO_ERR);
- }
-
- template <cl_int name> typename
- detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
- getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_kernel_arg_info, name>::param_type param;
- cl_int result = getArgInfo(argIndex, name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-#endif // #if defined(CL_VERSION_1_2)
-
- template <typename T>
- cl_int getWorkGroupInfo(
- const Device& device, cl_kernel_work_group_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(
- &::clGetKernelWorkGroupInfo, object_, device(), name, param),
- __GET_KERNEL_WORK_GROUP_INFO_ERR);
- }
-
- template <cl_int name> typename
- detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
- getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_kernel_work_group_info, name>::param_type param;
- cl_int result = getWorkGroupInfo(device, name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- template <typename T>
- cl_int setArg(cl_uint index, T value)
- {
- return detail::errHandler(
- ::clSetKernelArg(
- object_,
- index,
- detail::KernelArgumentHandler<T>::size(value),
- detail::KernelArgumentHandler<T>::ptr(value)),
- __SET_KERNEL_ARGS_ERR);
- }
-
- cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
- {
- return detail::errHandler(
- ::clSetKernelArg(object_, index, size, argPtr),
- __SET_KERNEL_ARGS_ERR);
- }
-};
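
A usage sketch of the Kernel wrapper; `program`, `inBuf`, `outBuf` and `device` are assumed to exist, and the kernel name "vector_add" is illustrative:

    cl_int err;
    cl::Kernel k(program, "vector_add", &err);
    k.setArg(0, inBuf);                   // cl::Buffer arguments are passed by value
    k.setArg(1, outBuf);
    k.setArg(2, (cl_uint)1024);
    ::size_t maxWg = k.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device, &err);
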
-
-/*! \class Program
- * \brief Program interface that implements cl_program.
- */
-class Program : public detail::Wrapper<cl_program>
-{
-public:
- typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
- typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
-
- Program(
- const STRING_CLASS& source,
- bool build = false,
- cl_int* err = NULL)
- {
- cl_int error;
-
- const char * strings = source.c_str();
- const ::size_t length = source.size();
-
- Context context = Context::getDefault(err);
-
- object_ = ::clCreateProgramWithSource(
- context(), (cl_uint)1, &strings, &length, &error);
-
- detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-
- if (error == CL_SUCCESS && build) {
-
- error = ::clBuildProgram(
- object_,
- 0,
- NULL,
- "",
- NULL,
- NULL);
-
- detail::errHandler(error, __BUILD_PROGRAM_ERR);
- }
-
- if (err != NULL) {
- *err = error;
- }
- }
-
- Program(
- const Context& context,
- const STRING_CLASS& source,
- bool build = false,
- cl_int* err = NULL)
- {
- cl_int error;
-
- const char * strings = source.c_str();
- const ::size_t length = source.size();
-
- object_ = ::clCreateProgramWithSource(
- context(), (cl_uint)1, &strings, &length, &error);
-
- detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-
- if (error == CL_SUCCESS && build) {
-
- error = ::clBuildProgram(
- object_,
- 0,
- NULL,
- "",
- NULL,
- NULL);
-
- detail::errHandler(error, __BUILD_PROGRAM_ERR);
- }
-
- if (err != NULL) {
- *err = error;
- }
- }
-
- Program(
- const Context& context,
- const Sources& sources,
- cl_int* err = NULL)
- {
- cl_int error;
-
- const ::size_t n = (::size_t)sources.size();
- ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
- const char** strings = (const char**) alloca(n * sizeof(const char*));
-
- for (::size_t i = 0; i < n; ++i) {
- strings[i] = sources[(int)i].first;
- lengths[i] = sources[(int)i].second;
- }
-
- object_ = ::clCreateProgramWithSource(
- context(), (cl_uint)n, strings, lengths, &error);
-
- detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- /**
- * Construct a program object from a list of devices and a per-device list of binaries.
- * \param context A valid OpenCL context in which to construct the program.
- * \param devices A vector of OpenCL device objects for which the program will be created.
- * \param binaries A vector of pairs of a pointer to a binary object and its length.
- * \param binaryStatus An optional vector that on completion will be resized to
- * match the size of binaries and filled with values to specify if each binary
- * was successfully loaded.
- * Set to CL_SUCCESS if the binary was successfully loaded.
- * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
- * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
- * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
- * CL_INVALID_CONTEXT if context is not a valid context.
- * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
- * or if any entry in binaries is NULL or has length 0.
- * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
- * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
- * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
- */
- Program(
- const Context& context,
- const VECTOR_CLASS<Device>& devices,
- const Binaries& binaries,
- VECTOR_CLASS<cl_int>* binaryStatus = NULL,
- cl_int* err = NULL)
- {
- cl_int error;
-
- const ::size_t numDevices = devices.size();
-
- // Catch size mismatch early and return
- if(binaries.size() != numDevices) {
- error = CL_INVALID_VALUE;
- detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
- if (err != NULL) {
- *err = error;
- }
- return;
- }
-
- ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
- const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
-
- for (::size_t i = 0; i < numDevices; ++i) {
- images[i] = (const unsigned char*)binaries[i].first;
- lengths[i] = binaries[i].second;
- }
-
- cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
- for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
- deviceIDs[deviceIndex] = (devices[deviceIndex])();
- }
-
- if(binaryStatus) {
- binaryStatus->resize(numDevices);
- }
-
- object_ = ::clCreateProgramWithBinary(
- context(), (cl_uint) devices.size(),
- deviceIDs,
- lengths, images, binaryStatus != NULL
- ? &binaryStatus->front()
- : NULL, &error);
-
- detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
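
A sketch of the binary-loading path documented above; `context` and `devices` are assumed, and `blob` stands in for bytes saved from an earlier CL_PROGRAM_BINARIES query:

    std::vector<unsigned char> blob;      // assumed: filled with a previously saved binary
    cl::Program::Binaries bins;
    bins.push_back(std::make_pair(&blob[0], blob.size()));   // one binary per device
    VECTOR_CLASS<cl_int> status;
    cl_int err;
    cl::Program prog(context, devices, bins, &status, &err);
    // status[i] holds the per-device load result described above.
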
-
-#if defined(CL_VERSION_1_2)
- /**
- * Create program using builtin kernels.
- * \param kernelNames Semicolon-separated list of built-in kernel names
- */
- Program(
- const Context& context,
- const VECTOR_CLASS<Device>& devices,
- const STRING_CLASS& kernelNames,
- cl_int* err = NULL)
- {
- cl_int error;
-
- ::size_t numDevices = devices.size();
- cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
- for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
- deviceIDs[deviceIndex] = (devices[deviceIndex])();
- }
-
- object_ = ::clCreateProgramWithBuiltInKernels(
- context(),
- (cl_uint) devices.size(),
- deviceIDs,
- kernelNames.c_str(),
- &error);
-
- detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-#endif // #if defined(CL_VERSION_1_2)
-
- Program() { }
-
- Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
-
- __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
-
- Program& operator = (const Program& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- Program& operator = (const cl_program& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- cl_int build(
- const VECTOR_CLASS<Device>& devices,
- const char* options = NULL,
- void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
- void* data = NULL) const
- {
- ::size_t numDevices = devices.size();
- cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
- for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
- deviceIDs[deviceIndex] = (devices[deviceIndex])();
- }
-
- return detail::errHandler(
- ::clBuildProgram(
- object_,
- (cl_uint)
- devices.size(),
- deviceIDs,
- options,
- notifyFptr,
- data),
- __BUILD_PROGRAM_ERR);
- }
-
- cl_int build(
- const char* options = NULL,
- void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
- void* data = NULL) const
- {
- return detail::errHandler(
- ::clBuildProgram(
- object_,
- 0,
- NULL,
- options,
- notifyFptr,
- data),
- __BUILD_PROGRAM_ERR);
- }
-
-#if defined(CL_VERSION_1_2)
- cl_int compile(
- const char* options = NULL,
- void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
- void* data = NULL) const
- {
- return detail::errHandler(
- ::clCompileProgram(
- object_,
- 0,
- NULL,
- options,
- 0,
- NULL,
- NULL,
- notifyFptr,
- data),
- __COMPILE_PROGRAM_ERR);
- }
-#endif
-
- template <typename T>
- cl_int getInfo(cl_program_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(&::clGetProgramInfo, object_, name, param),
- __GET_PROGRAM_INFO_ERR);
- }
-
- template <cl_int name> typename
- detail::param_traits<detail::cl_program_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_program_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- template <typename T>
- cl_int getBuildInfo(
- const Device& device, cl_program_build_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(
- &::clGetProgramBuildInfo, object_, device(), name, param),
- __GET_PROGRAM_BUILD_INFO_ERR);
- }
-
- template <cl_int name> typename
- detail::param_traits<detail::cl_program_build_info, name>::param_type
- getBuildInfo(const Device& device, cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_program_build_info, name>::param_type param;
- cl_int result = getBuildInfo(device, name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
- {
- cl_uint numKernels;
- cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
- }
-
- Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
- err = ::clCreateKernelsInProgram(
- object_, numKernels, (cl_kernel*) value, NULL);
- if (err != CL_SUCCESS) {
- return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
- }
-
- kernels->assign(&value[0], &value[numKernels]);
- return CL_SUCCESS;
- }
-};
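
A sketch of the typical build-from-source flow; `context`, `devices` and `source` are assumed inputs, and STRING_CLASS defaults to std::string:

    cl_int err;
    cl::Program prog(context, source, false, &err);
    if (prog.build(devices, "-cl-std=CL1.2") != CL_SUCCESS) {
        // On failure, the build log explains why:
        STRING_CLASS log = prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
    }
    VECTOR_CLASS<cl::Kernel> kernels;
    prog.createKernels(&kernels);        // one entry per __kernel in the program
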
-
-#if defined(CL_VERSION_1_2)
-inline Program linkProgram(
- Program input1,
- Program input2,
- const char* options = NULL,
- void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
- void* data = NULL,
- cl_int* err = NULL)
-{
- cl_int err_local = CL_SUCCESS;
-
- cl_program programs[2] = { input1(), input2() };
-
- Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
-
- cl_program prog = ::clLinkProgram(
- ctx(),
- 0,
- NULL,
- options,
- 2,
- programs,
- notifyFptr,
- data,
- &err_local);
-
- detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
- if (err != NULL) {
- *err = err_local;
- }
-
- return Program(prog);
-}
-
-inline Program linkProgram(
- VECTOR_CLASS<Program> inputPrograms,
- const char* options = NULL,
- void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
- void* data = NULL,
- cl_int* err = NULL)
-{
- cl_int err_local = CL_SUCCESS;
-
- cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
-
- if (programs != NULL) {
- for (unsigned int i = 0; i < inputPrograms.size(); i++) {
- programs[i] = inputPrograms[i]();
- }
- }
-
- cl_program prog = ::clLinkProgram(
- Context::getDefault()(),
- 0,
- NULL,
- options,
- (cl_uint)inputPrograms.size(),
- programs,
- notifyFptr,
- data,
- &err_local);
-
- detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
- if (err != NULL) {
- *err = err_local;
- }
-
- return Program(prog);
-}
-#endif
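
A sketch of separate compilation plus linking (OpenCL 1.2 only); `context`, `srcA` and `srcB` are assumed source strings:

    cl::Program a(context, srcA, false);
    cl::Program b(context, srcB, false);
    a.compile();                         // clCompileProgram wrapper
    b.compile();
    cl_int err;
    cl::Program linked = cl::linkProgram(a, b, NULL, NULL, NULL, &err);
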
-
-template<>
-inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
-{
- VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
- VECTOR_CLASS<char *> binaries;
- for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
- {
- char *ptr = NULL;
- if (*s != 0)
- ptr = new char[*s];
- binaries.push_back(ptr);
- }
-
- cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
- if (err != NULL) {
- *err = result;
- }
- return binaries;
-}
-
-inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
-{
- cl_int error;
-
- object_ = ::clCreateKernel(program(), name, &error);
- detail::errHandler(error, __CREATE_KERNEL_ERR);
-
- if (err != NULL) {
- *err = error;
- }
-
-}
-
-/*! \class CommandQueue
- * \brief CommandQueue interface for cl_command_queue.
- */
-class CommandQueue : public detail::Wrapper<cl_command_queue>
-{
-private:
- static volatile int default_initialized_;
- static CommandQueue default_;
- static volatile cl_int default_error_;
-public:
- CommandQueue(
- cl_command_queue_properties properties,
- cl_int* err = NULL)
- {
- cl_int error;
-
- Context context = Context::getDefault(&error);
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
- if (error != CL_SUCCESS) {
- if (err != NULL) {
- *err = error;
- }
- }
- else {
- Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-
- object_ = ::clCreateCommandQueue(
- context(), device(), properties, &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
- }
- /*!
- * \brief Constructs a CommandQueue for an implementation-defined device in the given context
- */
- explicit CommandQueue(
- const Context& context,
- cl_command_queue_properties properties = 0,
- cl_int* err = NULL)
- {
- cl_int error;
- VECTOR_CLASS<cl::Device> devices;
- error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
- if (error != CL_SUCCESS)
- {
- if (err != NULL) {
- *err = error;
- }
- return;
- }
-
- object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
- if (err != NULL) {
- *err = error;
- }
-
- }
-
- CommandQueue(
- const Context& context,
- const Device& device,
- cl_command_queue_properties properties = 0,
- cl_int* err = NULL)
- {
- cl_int error;
- object_ = ::clCreateCommandQueue(
- context(), device(), properties, &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- static CommandQueue getDefault(cl_int * err = NULL)
- {
- int state = detail::compare_exchange(
- &default_initialized_,
- __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
-
- if (state & __DEFAULT_INITIALIZED) {
- if (err != NULL) {
- *err = default_error_;
- }
- return default_;
- }
-
- if (state & __DEFAULT_BEING_INITIALIZED) {
- // Assume writes will propagate eventually...
- while(default_initialized_ != __DEFAULT_INITIALIZED) {
- detail::fence();
- }
-
- if (err != NULL) {
- *err = default_error_;
- }
- return default_;
- }
-
- cl_int error;
-
- Context context = Context::getDefault(&error);
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
- if (error != CL_SUCCESS) {
- if (err != NULL) {
- *err = error;
- }
- }
- else {
- Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-
- default_ = CommandQueue(context, device, 0, &error);
-
- detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-
- detail::fence();
-
- default_error_ = error;
- // Assume writes will propagate eventually...
- default_initialized_ = __DEFAULT_INITIALIZED;
-
- detail::fence();
-
- if (err != NULL) {
- *err = default_error_;
- }
- return default_;
-
- }
-
- CommandQueue() { }
-
- CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-
- CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-
- CommandQueue& operator = (const CommandQueue& rhs)
- {
- if (this != &rhs) {
- detail::Wrapper<cl_type>::operator=(rhs);
- }
- return *this;
- }
-
- CommandQueue& operator = (const cl_command_queue& rhs)
- {
- detail::Wrapper<cl_type>::operator=(rhs);
- return *this;
- }
-
- template <typename T>
- cl_int getInfo(cl_command_queue_info name, T* param) const
- {
- return detail::errHandler(
- detail::getInfo(
- &::clGetCommandQueueInfo, object_, name, param),
- __GET_COMMAND_QUEUE_INFO_ERR);
- }
-
- template <cl_int name> typename
- detail::param_traits<detail::cl_command_queue_info, name>::param_type
- getInfo(cl_int* err = NULL) const
- {
- typename detail::param_traits<
- detail::cl_command_queue_info, name>::param_type param;
- cl_int result = getInfo(name, &param);
- if (err != NULL) {
- *err = result;
- }
- return param;
- }
-
- cl_int enqueueReadBuffer(
- const Buffer& buffer,
- cl_bool blocking,
- ::size_t offset,
- ::size_t size,
- void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueReadBuffer(
- object_, buffer(), blocking, offset, size,
- ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_READ_BUFFER_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueWriteBuffer(
- const Buffer& buffer,
- cl_bool blocking,
- ::size_t offset,
- ::size_t size,
- const void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueWriteBuffer(
- object_, buffer(), blocking, offset, size,
- ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_WRITE_BUFFER_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueCopyBuffer(
- const Buffer& src,
- const Buffer& dst,
- ::size_t src_offset,
- ::size_t dst_offset,
- ::size_t size,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueCopyBuffer(
- object_, src(), dst(), src_offset, dst_offset, size,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQEUE_COPY_BUFFER_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueReadBufferRect(
- const Buffer& buffer,
- cl_bool blocking,
- const size_t<3>& buffer_offset,
- const size_t<3>& host_offset,
- const size_t<3>& region,
- ::size_t buffer_row_pitch,
- ::size_t buffer_slice_pitch,
- ::size_t host_row_pitch,
- ::size_t host_slice_pitch,
- void *ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueReadBufferRect(
- object_,
- buffer(),
- blocking,
- (const ::size_t *)buffer_offset,
- (const ::size_t *)host_offset,
- (const ::size_t *)region,
- buffer_row_pitch,
- buffer_slice_pitch,
- host_row_pitch,
- host_slice_pitch,
- ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_READ_BUFFER_RECT_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueWriteBufferRect(
- const Buffer& buffer,
- cl_bool blocking,
- const size_t<3>& buffer_offset,
- const size_t<3>& host_offset,
- const size_t<3>& region,
- ::size_t buffer_row_pitch,
- ::size_t buffer_slice_pitch,
- ::size_t host_row_pitch,
- ::size_t host_slice_pitch,
- void *ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueWriteBufferRect(
- object_,
- buffer(),
- blocking,
- (const ::size_t *)buffer_offset,
- (const ::size_t *)host_offset,
- (const ::size_t *)region,
- buffer_row_pitch,
- buffer_slice_pitch,
- host_row_pitch,
- host_slice_pitch,
- ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_WRITE_BUFFER_RECT_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueCopyBufferRect(
- const Buffer& src,
- const Buffer& dst,
- const size_t<3>& src_origin,
- const size_t<3>& dst_origin,
- const size_t<3>& region,
- ::size_t src_row_pitch,
- ::size_t src_slice_pitch,
- ::size_t dst_row_pitch,
- ::size_t dst_slice_pitch,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueCopyBufferRect(
- object_,
- src(),
- dst(),
- (const ::size_t *)src_origin,
- (const ::size_t *)dst_origin,
- (const ::size_t *)region,
- src_row_pitch,
- src_slice_pitch,
- dst_row_pitch,
- dst_slice_pitch,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQEUE_COPY_BUFFER_RECT_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
-#if defined(CL_VERSION_1_2)
- /**
- * Enqueue a command to fill a buffer object with a pattern
- * of a given size. The pattern is specified as a vector.
- * \tparam PatternType The datatype of the pattern field.
- * The pattern type must be an accepted OpenCL data type.
- */
- template<typename PatternType>
- cl_int enqueueFillBuffer(
- const Buffer& buffer,
- PatternType pattern,
- ::size_t offset,
- ::size_t size,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueFillBuffer(
- object_,
- buffer(),
- static_cast<void*>(&pattern),
- sizeof(PatternType),
- offset,
- size,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_FILL_BUFFER_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-#endif // #if defined(CL_VERSION_1_2)
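
A sketch of the fill-buffer wrapper above, assuming `queue` and `buf` already exist; the pattern is repeated across the byte range:

    cl_float zero = 0.0f;
    queue.enqueueFillBuffer(buf, zero, 0, 1024 * sizeof(cl_float));
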
-
- cl_int enqueueReadImage(
- const Image& image,
- cl_bool blocking,
- const size_t<3>& origin,
- const size_t<3>& region,
- ::size_t row_pitch,
- ::size_t slice_pitch,
- void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueReadImage(
- object_, image(), blocking, (const ::size_t *) origin,
- (const ::size_t *) region, row_pitch, slice_pitch, ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_READ_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueWriteImage(
- const Image& image,
- cl_bool blocking,
- const size_t<3>& origin,
- const size_t<3>& region,
- ::size_t row_pitch,
- ::size_t slice_pitch,
- void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueWriteImage(
- object_, image(), blocking, (const ::size_t *) origin,
- (const ::size_t *) region, row_pitch, slice_pitch, ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_WRITE_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueCopyImage(
- const Image& src,
- const Image& dst,
- const size_t<3>& src_origin,
- const size_t<3>& dst_origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueCopyImage(
- object_, src(), dst(), (const ::size_t *) src_origin,
- (const ::size_t *)dst_origin, (const ::size_t *) region,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_COPY_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
-#if defined(CL_VERSION_1_2)
- /**
- * Enqueue a command to fill an image object with a specified color.
- * \param fillColor is the color to use to fill the image.
- * This is a four component RGBA floating-point color value if
- * the image channel data type is not an unnormalized signed or
- * unsigned data type.
- */
- cl_int enqueueFillImage(
- const Image& image,
- cl_float4 fillColor,
- const size_t<3>& origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueFillImage(
- object_,
- image(),
- static_cast<void*>(&fillColor),
- (const ::size_t *) origin,
- (const ::size_t *) region,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_FILL_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- /**
- * Enqueue a command to fill an image object with a specified color.
- * \param fillColor is the color to use to fill the image.
- * This is a four component RGBA signed integer color value if
- * the image channel data type is an unnormalized signed integer
- * type.
- */
- cl_int enqueueFillImage(
- const Image& image,
- cl_int4 fillColor,
- const size_t<3>& origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueFillImage(
- object_,
- image(),
- static_cast<void*>(&fillColor),
- (const ::size_t *) origin,
- (const ::size_t *) region,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_FILL_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- /**
- * Enqueue a command to fill an image object with a specified color.
- * \param fillColor is the color to use to fill the image.
- * This is a four component RGBA unsigned integer color value if
- * the image channel data type is an unnormalized unsigned integer
- * type.
- */
- cl_int enqueueFillImage(
- const Image& image,
- cl_uint4 fillColor,
- const size_t<3>& origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueFillImage(
- object_,
- image(),
- static_cast<void*>(&fillColor),
- (const ::size_t *) origin,
- (const ::size_t *) region,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_FILL_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-#endif // #if defined(CL_VERSION_1_2)
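
A sketch of filling an image region, assuming `queue` and a float-channel `image`; origin and region follow the usual (x, y, z) convention:

    cl_float4 red = {{ 1.0f, 0.0f, 0.0f, 1.0f }};
    cl::size_t<3> origin; origin[0] = 0; origin[1] = 0; origin[2] = 0;
    cl::size_t<3> region; region[0] = 512; region[1] = 512; region[2] = 1;
    queue.enqueueFillImage(image, red, origin, region);
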
-
- cl_int enqueueCopyImageToBuffer(
- const Image& src,
- const Buffer& dst,
- const size_t<3>& src_origin,
- const size_t<3>& region,
- ::size_t dst_offset,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueCopyImageToBuffer(
- object_, src(), dst(), (const ::size_t *) src_origin,
- (const ::size_t *) region, dst_offset,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueCopyBufferToImage(
- const Buffer& src,
- const Image& dst,
- ::size_t src_offset,
- const size_t<3>& dst_origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueCopyBufferToImage(
- object_, src(), dst(), src_offset,
- (const ::size_t *) dst_origin, (const ::size_t *) region,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- void* enqueueMapBuffer(
- const Buffer& buffer,
- cl_bool blocking,
- cl_map_flags flags,
- ::size_t offset,
- ::size_t size,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL,
- cl_int* err = NULL) const
- {
- cl_int error;
- void * result = ::clEnqueueMapBuffer(
- object_, buffer(), blocking, flags, offset, size,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (cl_event*) event,
- &error);
-
- detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- return result;
- }
-
- void* enqueueMapImage(
- const Image& image,
- cl_bool blocking,
- cl_map_flags flags,
- const size_t<3>& origin,
- const size_t<3>& region,
- ::size_t * row_pitch,
- ::size_t * slice_pitch,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL,
- cl_int* err = NULL) const
- {
- cl_int error;
- void * result = ::clEnqueueMapImage(
- object_, image(), blocking, flags,
- (const ::size_t *) origin, (const ::size_t *) region,
- row_pitch, slice_pitch,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (cl_event*) event,
- &error);
-
- detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
- if (err != NULL) {
- *err = error;
- }
- return result;
- }
-
- cl_int enqueueUnmapMemObject(
- const Memory& memory,
- void* mapped_ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueUnmapMemObject(
- object_, memory(), mapped_ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
-#if defined(CL_VERSION_1_2)
- /**
- * Enqueues a marker command which waits for either a list of events to complete,
- * or all previously enqueued commands to complete.
- *
- * Enqueues a marker command which waits for either a list of events to complete,
- * or if the list is empty it waits for all commands previously enqueued in command_queue
- * to complete before it completes. This command returns an event which can be waited on,
- * i.e. this event can be waited on to ensure that all events either in the event_wait_list
- * or all previously enqueued commands, queued before this command to command_queue,
- * have completed.
- */
- cl_int enqueueMarkerWithWaitList(
- const VECTOR_CLASS<Event> *events = 0,
- Event *event = 0)
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueMarkerWithWaitList(
- object_,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_MARKER_WAIT_LIST_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- /**
- * A synchronization point that enqueues a barrier operation.
- *
- * Enqueues a barrier command which waits for either a list of events to complete,
- * or if the list is empty it waits for all commands previously enqueued in command_queue
- * to complete before it completes. This command blocks command execution, that is, any
- * following commands enqueued after it do not execute until it completes. This command
- * returns an event which can be waited on, i.e. this event can be waited on to ensure that
- * all events either in the event_wait_list or all previously enqueued commands, queued
- * before this command to command_queue, have completed.
- */
- cl_int enqueueBarrierWithWaitList(
- const VECTOR_CLASS<Event> *events = 0,
- Event *event = 0)
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueBarrierWithWaitList(
- object_,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_BARRIER_WAIT_LIST_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
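
A sketch of chaining the two wait-list commands above on an assumed `queue`:

    cl::Event marker;
    queue.enqueueMarkerWithWaitList(NULL, &marker);   // completes with all prior commands
    VECTOR_CLASS<cl::Event> deps(1, marker);
    queue.enqueueBarrierWithWaitList(&deps);          // later commands wait on the marker
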
-
- /**
- * Enqueues a command to indicate with which device a set of memory objects
- * should be associated.
- */
- cl_int enqueueMigrateMemObjects(
- const VECTOR_CLASS<Memory> &memObjects,
- cl_mem_migration_flags flags,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL
- )
- {
- cl_event tmp;
-
- cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
- for( int i = 0; i < (int)memObjects.size(); ++i ) {
- localMemObjects[i] = memObjects[i]();
- }
-
- cl_int err = detail::errHandler(
- ::clEnqueueMigrateMemObjects(
- object_,
- (cl_uint)memObjects.size(),
- static_cast<const cl_mem*>(localMemObjects),
- flags,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-#endif // #if defined(CL_VERSION_1_2)
-
- cl_int enqueueNDRangeKernel(
- const Kernel& kernel,
- const NDRange& offset,
- const NDRange& global,
- const NDRange& local = NullRange,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueNDRangeKernel(
- object_, kernel(), (cl_uint) global.dimensions(),
- offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
- (const ::size_t*) global,
- local.dimensions() != 0 ? (const ::size_t*) local : NULL,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_NDRANGE_KERNEL_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
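
A sketch of a 2-D launch through this wrapper; `queue` and `kernel` are assumed:

    cl::Event done;
    cl_int err = queue.enqueueNDRangeKernel(
        kernel,
        cl::NullRange,               // no global offset
        cl::NDRange(1024, 1024),     // global work size
        cl::NDRange(16, 16),         // work-group size; cl::NullRange lets the driver choose
        NULL, &done);
    if (err == CL_SUCCESS)
        done.wait();
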
-
- cl_int enqueueTask(
- const Kernel& kernel,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueTask(
- object_, kernel(),
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_TASK_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueNativeKernel(
- void (CL_CALLBACK *userFptr)(void *),
- std::pair<void*, ::size_t> args,
- const VECTOR_CLASS<Memory>* mem_objects = NULL,
- const VECTOR_CLASS<const void*>* mem_locs = NULL,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
- ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
- : NULL;
-
- if (mems != NULL) {
- for (unsigned int i = 0; i < mem_objects->size(); i++) {
- mems[i] = ((*mem_objects)[i])();
- }
- }
-
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueNativeKernel(
- object_, userFptr, args.first, args.second,
- (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
- mems,
- (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_NATIVE_KERNEL);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
- CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
- cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
- {
- return detail::errHandler(
- ::clEnqueueMarker(object_, (cl_event*) event),
- __ENQUEUE_MARKER_ERR);
- }
-
- CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
- cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
- {
- return detail::errHandler(
- ::clEnqueueWaitForEvents(
- object_,
- (cl_uint) events.size(),
- (const cl_event*) &events.front()),
- __ENQUEUE_WAIT_FOR_EVENTS_ERR);
- }
-#endif // deprecated OpenCL 1.1 APIs
-
- cl_int enqueueAcquireGLObjects(
- const VECTOR_CLASS<Memory>* mem_objects = NULL,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueAcquireGLObjects(
- object_,
- (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
- (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_ACQUIRE_GL_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueReleaseGLObjects(
- const VECTOR_CLASS<Memory>* mem_objects = NULL,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueReleaseGLObjects(
- object_,
- (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
- (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_RELEASE_GL_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
-#if defined (USE_DX_INTEROP)
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
- cl_command_queue command_queue, cl_uint num_objects,
- const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list, cl_event* event);
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
- cl_command_queue command_queue, cl_uint num_objects,
- const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list, cl_event* event);
-
- cl_int enqueueAcquireD3D10Objects(
- const VECTOR_CLASS<Memory>* mem_objects = NULL,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
- cl_context context = getInfo<CL_QUEUE_CONTEXT>();
- cl::Device device(getInfo<CL_QUEUE_DEVICE>());
- cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
- __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
-#endif
-#if defined(CL_VERSION_1_1)
- __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
-#endif
-
- cl_event tmp;
- cl_int err = detail::errHandler(
- pfn_clEnqueueAcquireD3D10ObjectsKHR(
- object_,
- (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
- (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_ACQUIRE_GL_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-
- cl_int enqueueReleaseD3D10Objects(
- const VECTOR_CLASS<Memory>* mem_objects = NULL,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL) const
- {
- static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
- cl_context context = getInfo<CL_QUEUE_CONTEXT>();
- cl::Device device(getInfo<CL_QUEUE_DEVICE>());
- cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
- __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_2)
-#if defined(CL_VERSION_1_1)
- __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_1)
-
- cl_event tmp;
- cl_int err = detail::errHandler(
- pfn_clEnqueueReleaseD3D10ObjectsKHR(
- object_,
- (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
- (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_RELEASE_GL_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
- }
-#endif
-
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
- CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
- cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
- {
- return detail::errHandler(
- ::clEnqueueBarrier(object_),
- __ENQUEUE_BARRIER_ERR);
- }
-#endif // deprecated OpenCL 1.1 APIs
-
- cl_int flush() const
- {
- return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
- }
-
- cl_int finish() const
- {
- return detail::errHandler(::clFinish(object_), __FINISH_ERR);
- }
-};
-
-#ifdef _WIN32
-__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__declspec(selectany) CommandQueue CommandQueue::default_;
-__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
-#else
-__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__attribute__((weak)) CommandQueue CommandQueue::default_;
-__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
-#endif
-
-template< typename IteratorType >
-Buffer::Buffer(
- const Context &context,
- IteratorType startIterator,
- IteratorType endIterator,
- bool readOnly,
- bool useHostPtr,
- cl_int* err)
-{
- typedef typename std::iterator_traits<IteratorType>::value_type DataType;
- cl_int error;
-
- cl_mem_flags flags = 0;
- if( readOnly ) {
- flags |= CL_MEM_READ_ONLY;
- }
- else {
- flags |= CL_MEM_READ_WRITE;
- }
- if( useHostPtr ) {
- flags |= CL_MEM_USE_HOST_PTR;
- }
-
- ::size_t size = sizeof(DataType)*(endIterator - startIterator);
-
- if( useHostPtr ) {
- object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
- } else {
- object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
- }
-
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
-
- if( !useHostPtr ) {
- CommandQueue queue(context, 0, &error);
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
-
- error = cl::copy(queue, startIterator, endIterator, *this);
- detail::errHandler(error, __CREATE_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- }
-}
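
A sketch of this iterator-range constructor with an assumed `context`; the data is copied to the device because useHostPtr is false:

    std::vector<float> host(1024, 1.0f);
    cl_int err;
    cl::Buffer buf(context, host.begin(), host.end(),
                   true /*readOnly*/, false /*useHostPtr*/, &err);
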
-
-inline cl_int enqueueReadBuffer(
- const Buffer& buffer,
- cl_bool blocking,
- ::size_t offset,
- ::size_t size,
- void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
-}
-
-inline cl_int enqueueWriteBuffer(
- const Buffer& buffer,
- cl_bool blocking,
- ::size_t offset,
- ::size_t size,
- const void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
-}
-
-inline void* enqueueMapBuffer(
- const Buffer& buffer,
- cl_bool blocking,
- cl_map_flags flags,
- ::size_t offset,
- ::size_t size,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL,
- cl_int* err = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
- detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
-
- void * result = ::clEnqueueMapBuffer(
- queue(), buffer(), blocking, flags, offset, size,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (cl_event*) event,
- &error);
-
- detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
- if (err != NULL) {
- *err = error;
- }
- return result;
-}
-
-inline cl_int enqueueUnmapMemObject(
- const Memory& memory,
- void* mapped_ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
- detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
- if (error != CL_SUCCESS) {
- return error;
- }
-
- cl_event tmp;
- cl_int err = detail::errHandler(
- ::clEnqueueUnmapMemObject(
- queue(), memory(), mapped_ptr,
- (events != NULL) ? (cl_uint) events->size() : 0,
- (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
- (event != NULL) ? &tmp : NULL),
- __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
-
- if (event != NULL && err == CL_SUCCESS)
- *event = tmp;
-
- return err;
-}
-
-inline cl_int enqueueCopyBuffer(
- const Buffer& src,
- const Buffer& dst,
- ::size_t src_offset,
- ::size_t dst_offset,
- ::size_t size,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
-}
-
-/**
- * Blocking copy operation between iterators and a buffer.
- * Host to Device.
- * Uses default command queue.
- */
-template< typename IteratorType >
-inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
- if (error != CL_SUCCESS)
- return error;
-
- return cl::copy(queue, startIterator, endIterator, buffer);
-}
-
-/**
- * Blocking copy operation between iterators and a buffer.
- * Device to Host.
- * Uses default command queue.
- */
-template< typename IteratorType >
-inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
- if (error != CL_SUCCESS)
- return error;
-
- return cl::copy(queue, buffer, startIterator, endIterator);
-}
-
-/**
- * Blocking copy operation between iterators and a buffer.
- * Host to Device.
- * Uses specified queue.
- */
-template< typename IteratorType >
-inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
-{
- typedef typename std::iterator_traits<IteratorType>::value_type DataType;
- cl_int error;
-
- ::size_t length = endIterator-startIterator;
- ::size_t byteLength = length*sizeof(DataType);
-
- DataType *pointer =
- static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
- // if exceptions enabled, enqueueMapBuffer will throw
- if( error != CL_SUCCESS ) {
- return error;
- }
-#if defined(_MSC_VER)
- std::copy(
- startIterator,
- endIterator,
- stdext::checked_array_iterator<DataType*>(
- pointer, length));
-#else
- std::copy(startIterator, endIterator, pointer);
-#endif
- Event endEvent;
- error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
- // if exceptions enabled, enqueueUnmapMemObject will throw
- if( error != CL_SUCCESS ) {
- return error;
- }
- endEvent.wait();
- return CL_SUCCESS;
-}
-
-/**
- * Blocking copy operation between iterators and a buffer.
- * Device to Host.
- * Uses specified queue.
- */
-template< typename IteratorType >
-inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
-{
- typedef typename std::iterator_traits<IteratorType>::value_type DataType;
- cl_int error;
-
- ::size_t length = endIterator-startIterator;
- ::size_t byteLength = length*sizeof(DataType);
-
- DataType *pointer =
- static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
- // if exceptions enabled, enqueueMapBuffer will throw
- if( error != CL_SUCCESS ) {
- return error;
- }
- std::copy(pointer, pointer + length, startIterator);
- Event endEvent;
- error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
- // if exceptions enabled, enqueueUnmapMemObject will throw
- if( error != CL_SUCCESS ) {
- return error;
- }
- endEvent.wait();
- return CL_SUCCESS;
-}
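
A sketch pairing the two blocking copy directions, assuming `queue`, `buf` and a correctly sized host range:

    std::vector<float> host(1024);
    cl::copy(queue, buf, host.begin(), host.end());   // device -> host
    cl::copy(queue, host.begin(), host.end(), buf);   // host -> device
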
-
-#if defined(CL_VERSION_1_1)
-inline cl_int enqueueReadBufferRect(
- const Buffer& buffer,
- cl_bool blocking,
- const size_t<3>& buffer_offset,
- const size_t<3>& host_offset,
- const size_t<3>& region,
- ::size_t buffer_row_pitch,
- ::size_t buffer_slice_pitch,
- ::size_t host_row_pitch,
- ::size_t host_slice_pitch,
- void *ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueReadBufferRect(
- buffer,
- blocking,
- buffer_offset,
- host_offset,
- region,
- buffer_row_pitch,
- buffer_slice_pitch,
- host_row_pitch,
- host_slice_pitch,
- ptr,
- events,
- event);
-}
-
-inline cl_int enqueueWriteBufferRect(
- const Buffer& buffer,
- cl_bool blocking,
- const size_t<3>& buffer_offset,
- const size_t<3>& host_offset,
- const size_t<3>& region,
- ::size_t buffer_row_pitch,
- ::size_t buffer_slice_pitch,
- ::size_t host_row_pitch,
- ::size_t host_slice_pitch,
- void *ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueWriteBufferRect(
- buffer,
- blocking,
- buffer_offset,
- host_offset,
- region,
- buffer_row_pitch,
- buffer_slice_pitch,
- host_row_pitch,
- host_slice_pitch,
- ptr,
- events,
- event);
-}
-
-inline cl_int enqueueCopyBufferRect(
- const Buffer& src,
- const Buffer& dst,
- const size_t<3>& src_origin,
- const size_t<3>& dst_origin,
- const size_t<3>& region,
- ::size_t src_row_pitch,
- ::size_t src_slice_pitch,
- ::size_t dst_row_pitch,
- ::size_t dst_slice_pitch,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueCopyBufferRect(
- src,
- dst,
- src_origin,
- dst_origin,
- region,
- src_row_pitch,
- src_slice_pitch,
- dst_row_pitch,
- dst_slice_pitch,
- events,
- event);
-}
-#endif
-
-inline cl_int enqueueReadImage(
- const Image& image,
- cl_bool blocking,
- const size_t<3>& origin,
- const size_t<3>& region,
- ::size_t row_pitch,
- ::size_t slice_pitch,
- void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueReadImage(
- image,
- blocking,
- origin,
- region,
- row_pitch,
- slice_pitch,
- ptr,
- events,
- event);
-}
-
-inline cl_int enqueueWriteImage(
- const Image& image,
- cl_bool blocking,
- const size_t<3>& origin,
- const size_t<3>& region,
- ::size_t row_pitch,
- ::size_t slice_pitch,
- void* ptr,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueWriteImage(
- image,
- blocking,
- origin,
- region,
- row_pitch,
- slice_pitch,
- ptr,
- events,
- event);
-}
-
-inline cl_int enqueueCopyImage(
- const Image& src,
- const Image& dst,
- const size_t<3>& src_origin,
- const size_t<3>& dst_origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueCopyImage(
- src,
- dst,
- src_origin,
- dst_origin,
- region,
- events,
- event);
-}
-
-inline cl_int enqueueCopyImageToBuffer(
- const Image& src,
- const Buffer& dst,
- const size_t<3>& src_origin,
- const size_t<3>& region,
- ::size_t dst_offset,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueCopyImageToBuffer(
- src,
- dst,
- src_origin,
- region,
- dst_offset,
- events,
- event);
-}
-
-inline cl_int enqueueCopyBufferToImage(
- const Buffer& src,
- const Image& dst,
- ::size_t src_offset,
- const size_t<3>& dst_origin,
- const size_t<3>& region,
- const VECTOR_CLASS<Event>* events = NULL,
- Event* event = NULL)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.enqueueCopyBufferToImage(
- src,
- dst,
- src_offset,
- dst_origin,
- region,
- events,
- event);
-}
-
-
-inline cl_int flush(void)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.flush();
-}
-
-inline cl_int finish(void)
-{
- cl_int error;
- CommandQueue queue = CommandQueue::getDefault(&error);
-
- if (error != CL_SUCCESS) {
- return error;
- }
-
- return queue.finish();
-}
-
-// Kernel Functor support
-// New interface as of September 2011
-// Requires C++11 std::function (note: the std::tr1 variant is not supported)
-// Supported by Visual Studio 2010 and GCC 4.2 onwards
-
-struct EnqueueArgs
-{
- CommandQueue queue_;
- const NDRange offset_;
- const NDRange global_;
- const NDRange local_;
- VECTOR_CLASS<Event> events_;
-
- EnqueueArgs(NDRange global) :
- queue_(CommandQueue::getDefault()),
- offset_(NullRange),
- global_(global),
- local_(NullRange)
- {
-
- }
-
- EnqueueArgs(NDRange global, NDRange local) :
- queue_(CommandQueue::getDefault()),
- offset_(NullRange),
- global_(global),
- local_(local)
- {
-
- }
-
- EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
- queue_(CommandQueue::getDefault()),
- offset_(offset),
- global_(global),
- local_(local)
- {
-
- }
-
- EnqueueArgs(Event e, NDRange global) :
- queue_(CommandQueue::getDefault()),
- offset_(NullRange),
- global_(global),
- local_(NullRange)
- {
- events_.push_back(e);
- }
-
- EnqueueArgs(Event e, NDRange global, NDRange local) :
- queue_(CommandQueue::getDefault()),
- offset_(NullRange),
- global_(global),
- local_(local)
- {
- events_.push_back(e);
- }
-
- EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
- queue_(CommandQueue::getDefault()),
- offset_(offset),
- global_(global),
- local_(local)
- {
- events_.push_back(e);
- }
-
- EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) :
- queue_(CommandQueue::getDefault()),
- offset_(NullRange),
- global_(global),
- local_(NullRange),
- events_(events)
- {
-
- }
-
- EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
- queue_(CommandQueue::getDefault()),
- offset_(NullRange),
- global_(global),
- local_(local),
- events_(events)
- {
-
- }
-
- EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
- queue_(CommandQueue::getDefault()),
- offset_(offset),
- global_(global),
- local_(local),
- events_(events)
- {
-
- }
-
- EnqueueArgs(CommandQueue &queue, NDRange global) :
- queue_(queue),
- offset_(NullRange),
- global_(global),
- local_(NullRange)
- {
-
- }
-
- EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
- queue_(queue),
- offset_(NullRange),
- global_(global),
- local_(local)
- {
-
- }
-
- EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
- queue_(queue),
- offset_(offset),
- global_(global),
- local_(local)
- {
-
- }
-
- EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
- queue_(queue),
- offset_(NullRange),
- global_(global),
- local_(NullRange)
- {
- events_.push_back(e);
- }
-
- EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
- queue_(queue),
- offset_(NullRange),
- global_(global),
- local_(local)
- {
- events_.push_back(e);
- }
-
- EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
- queue_(queue),
- offset_(offset),
- global_(global),
- local_(local)
- {
- events_.push_back(e);
- }
-
- EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) :
- queue_(queue),
- offset_(NullRange),
- global_(global),
- local_(NullRange),
- events_(events)
- {
-
- }
-
- EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
- queue_(queue),
- offset_(NullRange),
- global_(global),
- local_(local),
- events_(events)
- {
-
- }
-
- EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
- queue_(queue),
- offset_(offset),
- global_(global),
- local_(local),
- events_(events)
- {
-
- }
-};
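
Each EnqueueArgs constructor just bundles a launch configuration (queue, offset,
global and local NDRanges, plus an optional wait list) so that a kernel functor
call reads like an ordinary function call. Representative constructions, where
queue and evt are assumed to exist:

    cl::EnqueueArgs a1(cl::NDRange(1024));        // default queue, global size only
    cl::EnqueueArgs a2(queue, cl::NDRange(1024),
                       cl::NDRange(64));          // explicit queue and work-group size
    cl::EnqueueArgs a3(queue, evt, cl::NDRange(512),
                       cl::NDRange(1024), cl::NDRange(64));
    // a3 waits on evt, with offset 512, global size 1024, local size 64.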
-
-namespace detail {
-
-class NullType {};
-
-template<int index, typename T0>
-struct SetArg
-{
- static void set (Kernel kernel, T0 arg)
- {
- kernel.setArg(index, arg);
- }
-};
-
-template<int index>
-struct SetArg<index, NullType>
-{
- static void set (Kernel, NullType)
- {
- }
-};
-
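SetArg is the compile-time dispatch that lets one 32-slot functor serve every
arity: the primary template forwards a real argument to Kernel::setArg, while
the NullType specialization compiles to a no-op for unused trailing slots. The
same idea in miniature (names here are hypothetical, for exposition only):

    struct StubKernel {                       // stands in for cl::Kernel
        template<typename T> void setArg(int, const T&) {}
    };
    class Unset {};                           // stands in for NullType

    template<int I, typename T>
    struct PutArg {
        static void put(StubKernel k, T v) { k.setArg(I, v); }
    };

    template<int I>
    struct PutArg<I, Unset> {                 // unused slot: does nothing
        static void put(StubKernel, Unset) {}
    };
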
-template <
- typename T0, typename T1, typename T2, typename T3,
- typename T4, typename T5, typename T6, typename T7,
- typename T8, typename T9, typename T10, typename T11,
- typename T12, typename T13, typename T14, typename T15,
- typename T16, typename T17, typename T18, typename T19,
- typename T20, typename T21, typename T22, typename T23,
- typename T24, typename T25, typename T26, typename T27,
- typename T28, typename T29, typename T30, typename T31
->
-class KernelFunctorGlobal
-{
-private:
- Kernel kernel_;
-
-public:
- KernelFunctorGlobal(
- Kernel kernel) :
- kernel_(kernel)
- {}
-
- KernelFunctorGlobal(
- const Program& program,
- const STRING_CLASS name,
- cl_int * err = NULL) :
- kernel_(program, name.c_str(), err)
- {}
-
- Event operator() (
- const EnqueueArgs& args,
- T0 t0,
- T1 t1 = NullType(),
- T2 t2 = NullType(),
- T3 t3 = NullType(),
- T4 t4 = NullType(),
- T5 t5 = NullType(),
- T6 t6 = NullType(),
- T7 t7 = NullType(),
- T8 t8 = NullType(),
- T9 t9 = NullType(),
- T10 t10 = NullType(),
- T11 t11 = NullType(),
- T12 t12 = NullType(),
- T13 t13 = NullType(),
- T14 t14 = NullType(),
- T15 t15 = NullType(),
- T16 t16 = NullType(),
- T17 t17 = NullType(),
- T18 t18 = NullType(),
- T19 t19 = NullType(),
- T20 t20 = NullType(),
- T21 t21 = NullType(),
- T22 t22 = NullType(),
- T23 t23 = NullType(),
- T24 t24 = NullType(),
- T25 t25 = NullType(),
- T26 t26 = NullType(),
- T27 t27 = NullType(),
- T28 t28 = NullType(),
- T29 t29 = NullType(),
- T30 t30 = NullType(),
- T31 t31 = NullType()
- )
- {
- Event event;
- SetArg<0, T0>::set(kernel_, t0);
- SetArg<1, T1>::set(kernel_, t1);
- SetArg<2, T2>::set(kernel_, t2);
- SetArg<3, T3>::set(kernel_, t3);
- SetArg<4, T4>::set(kernel_, t4);
- SetArg<5, T5>::set(kernel_, t5);
- SetArg<6, T6>::set(kernel_, t6);
- SetArg<7, T7>::set(kernel_, t7);
- SetArg<8, T8>::set(kernel_, t8);
- SetArg<9, T9>::set(kernel_, t9);
- SetArg<10, T10>::set(kernel_, t10);
- SetArg<11, T11>::set(kernel_, t11);
- SetArg<12, T12>::set(kernel_, t12);
- SetArg<13, T13>::set(kernel_, t13);
- SetArg<14, T14>::set(kernel_, t14);
- SetArg<15, T15>::set(kernel_, t15);
- SetArg<16, T16>::set(kernel_, t16);
- SetArg<17, T17>::set(kernel_, t17);
- SetArg<18, T18>::set(kernel_, t18);
- SetArg<19, T19>::set(kernel_, t19);
- SetArg<20, T20>::set(kernel_, t20);
- SetArg<21, T21>::set(kernel_, t21);
- SetArg<22, T22>::set(kernel_, t22);
- SetArg<23, T23>::set(kernel_, t23);
- SetArg<24, T24>::set(kernel_, t24);
- SetArg<25, T25>::set(kernel_, t25);
- SetArg<26, T26>::set(kernel_, t26);
- SetArg<27, T27>::set(kernel_, t27);
- SetArg<28, T28>::set(kernel_, t28);
- SetArg<29, T29>::set(kernel_, t29);
- SetArg<30, T30>::set(kernel_, t30);
- SetArg<31, T31>::set(kernel_, t31);
-
- args.queue_.enqueueNDRangeKernel(
- kernel_,
- args.offset_,
- args.global_,
- args.local_,
- &args.events_,
- &event);
-
- return event;
- }
-
-};
-
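A call through the functor therefore does three things: set every supplied
argument (unused slots resolve to the NullType no-op above), enqueue the kernel
with the EnqueueArgs configuration, and return the completion Event. In the
full header this class is normally reached through the make_kernel wrapper
defined further down; a sketch, assuming a built program containing a kernel
"vadd" and buffers a, b, c:

    cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd");
    cl::Event done = vadd(cl::EnqueueArgs(queue, cl::NDRange(n)), a, b, c, (int)n);
    done.wait();
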
-//------------------------------------------------------------------------------------------------------
-
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25,
- typename T26,
- typename T27,
- typename T28,
- typename T29,
- typename T30,
- typename T31>
-struct functionImplementation_
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- T30,
- T31> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- T30,
- T31);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25,
- T26 arg26,
- T27 arg27,
- T28 arg28,
- T29 arg29,
- T30 arg30,
- T31 arg31)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25,
- arg26,
- arg27,
- arg28,
- arg29,
- arg30,
- arg31);
- }
-
-
-};
-
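The remainder of this block is the same definition stamped out once per arity:
each partial specialization pins one more trailing parameter to NullType, so a
functor declared with N real types exposes an operator() taking exactly N
arguments after EnqueueArgs. The cascade in miniature, with 3 slots instead of
32 (hypothetical names, for exposition only):

    class Unused {};

    template<typename A0, typename A1, typename A2>
    struct Impl {                              // primary: three real arguments
        static int arity() { return 3; }
    };

    template<typename A0, typename A1>
    struct Impl<A0, A1, Unused> {              // last slot unused: two arguments
        static int arity() { return 2; }
    };

    template<typename A0>
    struct Impl<A0, Unused, Unused> {          // one argument
        static int arity() { return 1; }
    };

    // Impl<int, float, Unused>::arity() == 2, selected entirely at compile time.
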
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25,
- typename T26,
- typename T27,
- typename T28,
- typename T29,
- typename T30>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- T30,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- T30,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- T30);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25,
- T26 arg26,
- T27 arg27,
- T28 arg28,
- T29 arg29,
- T30 arg30)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25,
- arg26,
- arg27,
- arg28,
- arg29,
- arg30);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25,
- typename T26,
- typename T27,
- typename T28,
- typename T29>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- T29);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25,
- T26 arg26,
- T27 arg27,
- T28 arg28,
- T29 arg29)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25,
- arg26,
- arg27,
- arg28,
- arg29);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25,
- typename T26,
- typename T27,
- typename T28>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- T28);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25,
- T26 arg26,
- T27 arg27,
- T28 arg28)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25,
- arg26,
- arg27,
- arg28);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25,
- typename T26,
- typename T27>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- T27);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25,
- T26 arg26,
- T27 arg27)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25,
- arg26,
- arg27);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25,
- typename T26>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- T26);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25,
- T26 arg26)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25,
- arg26);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24,
- typename T25>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- T25);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24,
- T25 arg25)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24,
- arg25);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23,
- typename T24>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- T24);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23,
- T24 arg24)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23,
- arg24);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22,
- typename T23>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- T23);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22,
- T23 arg23)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22,
- arg23);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21,
- typename T22>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- T22);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21,
- T22 arg22)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21,
- arg22);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20,
- typename T21>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- T21);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20,
- T21 arg21)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20,
- arg21);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19,
- typename T20>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- T20);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19,
- T20 arg20)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19,
- arg20);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18,
- typename T19>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- T19);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18,
- T19 arg19)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18,
- arg19);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17,
- typename T18>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- T18);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17,
- T18 arg18)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17,
- arg18);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16,
- typename T17>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- T17);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16,
- T17 arg17)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16,
- arg17);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15,
- typename T16>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- T16);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15,
- T16 arg16)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15,
- arg16);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14,
- typename T15>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- T15);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14,
- T15 arg15)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14,
- arg15);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13,
- typename T14>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- T14);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13,
- T14 arg14)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13,
- arg14);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12,
- typename T13>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- T13);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12,
- T13 arg13)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12,
- arg13);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11,
- typename T12>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- T12);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11,
- T12 arg12)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11,
- arg12);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10,
- typename T11>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- T11);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10,
- T11 arg11)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10,
- arg11);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9,
- typename T10>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
- // Fail variadic expansion for dev11
-    static_assert(0, "Visual Studio has a hard limit on the argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that, VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- T10);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9,
- T10 arg10)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9,
- arg10);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8,
- typename T9>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- T9);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8,
- T9 arg9)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8,
- arg9);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7,
- typename T8>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- T8);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7,
- T8 arg8)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7,
- arg8);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6,
- typename T7>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- T7);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6,
- T7 arg7)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6,
- arg7);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5,
- typename T6>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- T6);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5,
- T6 arg6)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5,
- arg6);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
- typename T5>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- T5,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4,
- T5);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4,
- T5 arg5)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4,
- arg5);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3,
- typename T4>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- T4,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- T4,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3,
- T4);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3,
- T4 arg4)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3,
- arg4);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2,
- typename T3>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- T3,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- T3,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2,
- T3);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2,
- T3 arg3)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2,
- arg3);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1,
- typename T2>
-struct functionImplementation_
-< T0,
- T1,
- T2,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- T2,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1,
- T2);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1,
- T2 arg2)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1,
- arg2);
- }
-
-
-};
-
-template<
- typename T0,
- typename T1>
-struct functionImplementation_
-< T0,
- T1,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- T1,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0,
- T1);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0,
- T1 arg1)
- {
- return functor_(
- enqueueArgs,
- arg0,
- arg1);
- }
-
-
-};
-
-template<
- typename T0>
-struct functionImplementation_
-< T0,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType>
-{
- typedef detail::KernelFunctorGlobal<
- T0,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType,
- NullType> FunctorType;
-
- FunctorType functor_;
-
- functionImplementation_(const FunctorType &functor) :
- functor_(functor)
- {
-
- #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
- // Fail variadic expansion for dev11
- static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
- #endif
-
- }
-
- //! \brief Return type of the functor
- typedef Event result_type;
-
- //! \brief Function signature of kernel functor with no event dependency.
- typedef Event type_(
- const EnqueueArgs&,
- T0);
-
- Event operator()(
- const EnqueueArgs& enqueueArgs,
- T0 arg0)
- {
- return functor_(
- enqueueArgs,
- arg0);
- }
-
-
-};
-
-
-
-
-
-} // namespace detail
-
-//----------------------------------------------------------------------------------------------
-
-template <
- typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType,
- typename T3 = detail::NullType, typename T4 = detail::NullType,
- typename T5 = detail::NullType, typename T6 = detail::NullType,
- typename T7 = detail::NullType, typename T8 = detail::NullType,
- typename T9 = detail::NullType, typename T10 = detail::NullType,
- typename T11 = detail::NullType, typename T12 = detail::NullType,
- typename T13 = detail::NullType, typename T14 = detail::NullType,
- typename T15 = detail::NullType, typename T16 = detail::NullType,
- typename T17 = detail::NullType, typename T18 = detail::NullType,
- typename T19 = detail::NullType, typename T20 = detail::NullType,
- typename T21 = detail::NullType, typename T22 = detail::NullType,
- typename T23 = detail::NullType, typename T24 = detail::NullType,
- typename T25 = detail::NullType, typename T26 = detail::NullType,
- typename T27 = detail::NullType, typename T28 = detail::NullType,
- typename T29 = detail::NullType, typename T30 = detail::NullType,
- typename T31 = detail::NullType
->
-struct make_kernel :
- public detail::functionImplementation_<
- T0, T1, T2, T3,
- T4, T5, T6, T7,
- T8, T9, T10, T11,
- T12, T13, T14, T15,
- T16, T17, T18, T19,
- T20, T21, T22, T23,
- T24, T25, T26, T27,
- T28, T29, T30, T31
- >
-{
-public:
- typedef detail::KernelFunctorGlobal<
- T0, T1, T2, T3,
- T4, T5, T6, T7,
- T8, T9, T10, T11,
- T12, T13, T14, T15,
- T16, T17, T18, T19,
- T20, T21, T22, T23,
- T24, T25, T26, T27,
- T28, T29, T30, T31
- > FunctorType;
-
- make_kernel(
- const Program& program,
- const STRING_CLASS name,
- cl_int * err = NULL) :
- detail::functionImplementation_<
- T0, T1, T2, T3,
- T4, T5, T6, T7,
- T8, T9, T10, T11,
- T12, T13, T14, T15,
- T16, T17, T18, T19,
- T20, T21, T22, T23,
- T24, T25, T26, T27,
- T28, T29, T30, T31
- >(
- FunctorType(program, name, err))
- {}
-
- make_kernel(
- const Kernel kernel) :
- detail::functionImplementation_<
- T0, T1, T2, T3,
- T4, T5, T6, T7,
- T8, T9, T10, T11,
- T12, T13, T14, T15,
- T16, T17, T18, T19,
- T20, T21, T22, T23,
- T24, T25, T26, T27,
- T28, T29, T30, T31
- >(
- FunctorType(kernel))
- {}
-};
-
-
-//----------------------------------------------------------------------------------------------------------------------
-
-#undef __ERR_STR
-#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-#undef __GET_DEVICE_INFO_ERR
-#undef __GET_PLATFORM_INFO_ERR
-#undef __GET_DEVICE_IDS_ERR
-#undef __GET_CONTEXT_INFO_ERR
-#undef __GET_EVENT_INFO_ERR
-#undef __GET_EVENT_PROFILE_INFO_ERR
-#undef __GET_MEM_OBJECT_INFO_ERR
-#undef __GET_IMAGE_INFO_ERR
-#undef __GET_SAMPLER_INFO_ERR
-#undef __GET_KERNEL_INFO_ERR
-#undef __GET_KERNEL_ARG_INFO_ERR
-#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
-#undef __GET_PROGRAM_INFO_ERR
-#undef __GET_PROGRAM_BUILD_INFO_ERR
-#undef __GET_COMMAND_QUEUE_INFO_ERR
-
-#undef __CREATE_CONTEXT_ERR
-#undef __CREATE_CONTEXT_FROM_TYPE_ERR
-#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
-
-#undef __CREATE_BUFFER_ERR
-#undef __CREATE_SUBBUFFER_ERR
-#undef __CREATE_IMAGE2D_ERR
-#undef __CREATE_IMAGE3D_ERR
-#undef __CREATE_SAMPLER_ERR
-#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
-
-#undef __CREATE_USER_EVENT_ERR
-#undef __SET_USER_EVENT_STATUS_ERR
-#undef __SET_EVENT_CALLBACK_ERR
-#undef __SET_PRINTF_CALLBACK_ERR
-
-#undef __WAIT_FOR_EVENTS_ERR
-
-#undef __CREATE_KERNEL_ERR
-#undef __SET_KERNEL_ARGS_ERR
-#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
-#undef __CREATE_PROGRAM_WITH_BINARY_ERR
-#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
-#undef __BUILD_PROGRAM_ERR
-#undef __CREATE_KERNELS_IN_PROGRAM_ERR
-
-#undef __CREATE_COMMAND_QUEUE_ERR
-#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
-#undef __ENQUEUE_READ_BUFFER_ERR
-#undef __ENQUEUE_WRITE_BUFFER_ERR
-#undef __ENQUEUE_READ_BUFFER_RECT_ERR
-#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
-#undef __ENQEUE_COPY_BUFFER_ERR
-#undef __ENQEUE_COPY_BUFFER_RECT_ERR
-#undef __ENQUEUE_READ_IMAGE_ERR
-#undef __ENQUEUE_WRITE_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
-#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
-#undef __ENQUEUE_MAP_BUFFER_ERR
-#undef __ENQUEUE_MAP_IMAGE_ERR
-#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
-#undef __ENQUEUE_NDRANGE_KERNEL_ERR
-#undef __ENQUEUE_TASK_ERR
-#undef __ENQUEUE_NATIVE_KERNEL
-
-#undef __CL_EXPLICIT_CONSTRUCTORS
-
-#undef __UNLOAD_COMPILER_ERR
-#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
-
-#undef __CL_FUNCTION_TYPE
-
-// Extensions
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_VERSION_1_1)
-#undef __INIT_CL_EXT_FCN_PTR
-#endif // #if defined(CL_VERSION_1_1)
-#undef __CREATE_SUB_DEVICES
-
-#if defined(USE_CL_DEVICE_FISSION)
-#undef __PARAM_NAME_DEVICE_FISSION
-#endif // USE_CL_DEVICE_FISSION
-
-#undef __DEFAULT_NOT_INITIALIZED
-#undef __DEFAULT_BEING_INITIALIZED
-#undef __DEFAULT_INITIALIZED
-
-} // namespace cl
-
-#ifdef _WIN32
-#pragma pop_macro("max")
-#endif // _WIN32
-
-#endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
index b6c90b3..d5960a4 100644
--- a/include/CL/cl_d3d10.h
+++ b/include/CL/cl_d3d10.h
@@ -1,5 +1,5 @@
/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h
index 2e0a63f..39f9072 100644
--- a/include/CL/cl_d3d11.h
+++ b/include/CL/cl_d3d11.h
@@ -1,5 +1,5 @@
/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
index 23f1631..2729e8b 100644
--- a/include/CL/cl_dx9_media_sharing.h
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -1,5 +1,5 @@
/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -33,7 +38,7 @@
extern "C" {
#endif
-/******************************************************************************
+/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
index 93e6c9c..a765bd5 100644
--- a/include/CL/cl_egl.h
+++ b/include/CL/cl_egl.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -28,8 +33,6 @@
#else
#include <CL/cl.h>
-#include <EGL/egl.h>
-#include <EGL/eglext.h>
#endif
#ifdef __cplusplus
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 0a66d70..fa34cba 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008-2013 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -134,15 +139,15 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
* cl_khr_initalize_memory extension *
*************************************/
-#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
/**************************************
* cl_khr_terminate_context extension *
**************************************/
-#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F
-#define CL_CONTEXT_TERMINATE_KHR 0x2010
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
+#define CL_CONTEXT_TERMINATE_KHR 0x2032
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
@@ -384,7 +389,7 @@ typedef struct _cl_mem_ext_host_ptr
/* Legal values will be defined in layered extensions. */
cl_uint allocation_type;
- /* Host cache policy for this external memory allocation. */
+ /* Host cache policy for this external memory allocation. */
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
@@ -411,6 +416,40 @@ typedef struct _cl_mem_ion_host_ptr
#endif /* CL_VERSION_1_1 */
+
+#ifdef CL_VERSION_2_0
+/*********************************
+* cl_khr_sub_groups extension
+*********************************/
+#define cl_khr_sub_groups 1
+
+typedef cl_uint cl_kernel_sub_group_info;
+
+/* cl_khr_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+ cl_device_id /*in_device*/,
+ cl_kernel_sub_group_info /* param_name */,
+ size_t /*input_value_size*/,
+ const void * /*input_value*/,
+ size_t /*param_value_size*/,
+ void* /*param_value*/,
+ size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+
+typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+ cl_device_id /*in_device*/,
+ cl_kernel_sub_group_info /* param_name */,
+ size_t /*input_value_size*/,
+ const void * /*input_value*/,
+ size_t /*param_value_size*/,
+ void* /*param_value*/,
+ size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+
#ifdef __cplusplus
}
#endif
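
[editor's note] As a usage sketch for the cl_khr_sub_groups query declared in the hunk above (hedged: portable hosts usually fetch the entry point via clGetExtensionFunctionAddressForPlatform rather than linking it directly; `kernel` and `device` and the 1-D local size are assumptions, not part of the patch):

    #include <CL/cl.h>
    #include <CL/cl_ext.h>

    /* Sketch: ask how wide a sub-group the kernel would use for a
     * proposed work-group size of 64.  Assumes `kernel` and `device`
     * come from the usual clCreateKernel / clGetDeviceIDs setup. */
    static size_t max_subgroup_size_for(cl_kernel kernel, cl_device_id device)
    {
        size_t local_size = 64;   /* proposed NDRange local size (input) */
        size_t result = 0;        /* max sub-group size (output)         */
        cl_int err = clGetKernelSubGroupInfoKHR(
            kernel, device,
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
            sizeof(local_size), &local_size,
            sizeof(result), &result, NULL);
        return err == CL_SUCCESS ? result : 0;
    }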
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
index e52c1b6..945daa8 100644
--- a/include/CL/cl_gl.h
+++ b/include/CL/cl_gl.h
@@ -1,5 +1,5 @@
/**********************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
index 77d5353..e3c14c6 100644
--- a/include/CL/cl_gl_ext.h
+++ b/include/CL/cl_gl_ext.h
@@ -1,5 +1,5 @@
/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
index 7f6f5e8..f157b63 100644
--- a/include/CL/cl_platform.h
+++ b/include/CL/cl_platform.h
@@ -1,5 +1,5 @@
/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -45,6 +50,14 @@ extern "C" {
#define CL_CALLBACK
#endif
+/*
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
+
#ifdef __APPLE__
#define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
#define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
@@ -75,6 +88,8 @@ extern "C" {
#define CL_EXT_SUFFIX__VERSION_1_1
#define CL_API_SUFFIX__VERSION_1_2
#define CL_EXT_SUFFIX__VERSION_1_2
+ #define CL_API_SUFFIX__VERSION_2_0
+ #define CL_EXT_SUFFIX__VERSION_2_0
#ifdef __GNUC__
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
@@ -92,9 +107,17 @@ extern "C" {
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
#endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+ #endif
#elif _WIN32
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
- #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
@@ -108,12 +131,23 @@ extern "C" {
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
#endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated)
+ #endif
#else
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
#endif
#endif
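
[editor's note] To make the VERSION_1_2_DEPRECATED convention documented in the hunk above concrete, this is the pattern the prefix/suffix macros are used in (a sketch; clSomeLegacyCall is a hypothetical name, not a real entry point):

    /* Present without deprecation in 1.2, deprecated in later versions.
     * With CL_USE_DEPRECATED_OPENCL_1_2_APIS defined, both macros expand
     * to nothing; otherwise GCC gets __attribute__((deprecated)) on the
     * suffix and MSVC gets __declspec(deprecated) on the prefix, per the
     * blocks added above.  clSomeLegacyCall is hypothetical. */
    extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
    clSomeLegacyCall(cl_context context) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;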
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
index 3f00524..9855cd7 100644
--- a/include/CL/opencl.h
+++ b/include/CL/opencl.h
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/kernels/compiler_atomic_functions_20.cl b/kernels/compiler_atomic_functions_20.cl
new file mode 100644
index 0000000..cbca52e
--- /dev/null
+++ b/kernels/compiler_atomic_functions_20.cl
@@ -0,0 +1,53 @@
+__kernel void compiler_atomic_functions_20(__global int *dst, __local int *tmp, __global int *src) {
+ int lid = get_local_id(0);
+ int i = lid % 12;
+ atomic_int* p = (atomic_int*)tmp;
+ if(lid == 0) {
+ for(int j=0; j<12; j=j+1) {
+ atomic_exchange(&p[j], 0);
+ }
+ atomic_exchange(&p[4], -1);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ int compare = 0;
+
+ switch(i) {
+ case 0: atomic_inc(&tmp[i]); break;
+ case 1: atomic_dec(&tmp[i]); break;
+ case 2: atomic_fetch_add(&p[i], src[lid]); break;
+ case 3: atomic_fetch_sub(&p[i], src[lid]); break;
+ case 4: atomic_fetch_and(&p[i], ~(src[lid]<<(lid / 16))); break;
+ case 5: atomic_fetch_or (&p[i], src[lid]<<(lid / 16)); break;
+ case 6: atomic_fetch_xor(&p[i], src[lid]); break;
+ case 7: atomic_fetch_min(&p[i], -src[lid]); break;
+ case 8: atomic_fetch_max(&p[i], src[lid]); break;
+ case 9: atomic_fetch_min((atomic_uint*)&p[i], -src[lid]); break;
+ case 10: atomic_fetch_max((atomic_uint*)&p[i], src[lid]); break;
+ case 11: atomic_compare_exchange_strong(&p[i], &compare, src[10]); break;
+ default: break;
+ }
+
+ atomic_int* d = (atomic_int*)dst;
+ switch(i) {
+ case 0: atomic_inc(&dst[i]); break;
+ case 1: atomic_dec(&dst[i]); break;
+ case 2: atomic_fetch_add(&d[i], src[lid]); break;
+ case 3: atomic_fetch_sub(&d[i], src[lid]); break;
+ case 4: atomic_fetch_and(&d[i], ~(src[lid]<<(lid / 16))); break;
+ case 5: atomic_fetch_or (&d[i], src[lid]<<(lid / 16)); break;
+ case 6: atomic_fetch_xor(&d[i], src[lid]); break;
+ case 7: atomic_fetch_min(&d[i], -src[lid]); break;
+ case 8: atomic_fetch_max(&d[i], src[lid]); break;
+ case 9: atomic_fetch_min((atomic_uint*)&d[i], -src[lid]); break;
+ case 10: atomic_fetch_max((atomic_uint*)&d[i], src[lid]); break;
+ case 11: atomic_compare_exchange_strong(&d[i], &compare, src[10]); break;
+ default: break;
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ if(get_global_id(0) == 0) {
+ for(i=0; i<12; i=i+1)
+ atomic_xchg(&dst[i+12], tmp[i]);
+ }
+}
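
[editor's note] The test above deliberately mixes the legacy OpenCL 1.x atomics (atomic_inc/atomic_dec on plain int pointers) with the OpenCL 2.0 C11-style atomics on atomic_int. For reference, a minimal sketch of the two spellings of the same operation (not part of the patch):

    /* Same increment, two generations of the atomics API: */
    __kernel void inc_both(volatile __global int *legacy,
                           __global atomic_int *modern)
    {
        atomic_inc(legacy);          /* OpenCL 1.x built-in       */
        atomic_fetch_add(modern, 1); /* OpenCL 2.0 / C11 spelling */
    }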
diff --git a/kernels/compiler_ceil64.spir b/kernels/compiler_ceil64.spir
new file mode 100644
index 0000000..8357836
Binary files /dev/null and b/kernels/compiler_ceil64.spir differ
diff --git a/kernels/compiler_ctz.cl b/kernels/compiler_ctz.cl
new file mode 100644
index 0000000..8acdfb9
--- /dev/null
+++ b/kernels/compiler_ctz.cl
@@ -0,0 +1,16 @@
+#define COMPILER_CTZ(TYPE) \
+ kernel void compiler_ctz_##TYPE(global TYPE* src, global TYPE* dst) \
+{ \
+ __global TYPE* A = &src[get_global_id(0)]; \
+ __global TYPE* B = &dst[get_global_id(0)]; \
+ *B = ctz(*A); \
+}
+
+COMPILER_CTZ(ulong)
+COMPILER_CTZ(uint)
+COMPILER_CTZ(ushort)
+COMPILER_CTZ(uchar)
+COMPILER_CTZ(long)
+COMPILER_CTZ(int)
+COMPILER_CTZ(short)
+COMPILER_CTZ(char)
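
[editor's note] Written out, the uint instance the macro above generates looks like the following; ctz (new in OpenCL 2.0) returns the count of trailing zero bits, and the bit width of the type when the input is 0:

    kernel void compiler_ctz_uint(global uint* src, global uint* dst)
    {
        __global uint* A = &src[get_global_id(0)];
        __global uint* B = &dst[get_global_id(0)];
        *B = ctz(*A);   /* e.g. ctz(8u) == 3, ctz(0u) == 32 */
    }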
diff --git a/kernels/compiler_device_enqueue.cl b/kernels/compiler_device_enqueue.cl
new file mode 100644
index 0000000..cb20142
--- /dev/null
+++ b/kernels/compiler_device_enqueue.cl
@@ -0,0 +1,18 @@
+void block_fn(__global uint* val)
+{
+ atomic_add(val, get_global_id(0));
+}
+
+kernel void compiler_device_enqueue(uint glob_size_arr, __global uint* val)
+{
+ size_t tid = get_global_id(0);
+
+ for(int i = 0; i < glob_size_arr; i++)
+ {
+ ndrange_t ndrange = ndrange_1D(glob_size_arr);
+ __global uint * v = val + tid;
+ void (^kernelBlock)(void) = ^{ block_fn(v); };
+ queue_t q = get_default_queue();
+ enqueue_kernel(q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+ }
+}
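
[editor's note] get_default_queue() in the kernel above only returns a usable queue if the host created a default on-device queue first. A hedged host-side sketch (`ctx` and `dev` are assumed to exist; error handling elided):

    /* Sketch: create the default device-side queue that the
     * enqueue_kernel call above dispatches to.  On-device queues
     * must be out-of-order. */
    cl_queue_properties props[] = {
        CL_QUEUE_PROPERTIES,
        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
        CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT,
        0
    };
    cl_int err;
    cl_command_queue dev_queue =
        clCreateCommandQueueWithProperties(ctx, dev, props, &err);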
diff --git a/kernels/compiler_generic_atomic.cl b/kernels/compiler_generic_atomic.cl
new file mode 100644
index 0000000..3db49e2
--- /dev/null
+++ b/kernels/compiler_generic_atomic.cl
@@ -0,0 +1,33 @@
+#define GENERIC_KERNEL(T) \
+kernel void compiler_generic_atomic_##T(global T *src, global T *dst) \
+{ \
+ size_t gid = get_global_id(0); \
+ size_t lid = get_local_id(0); \
+ private T pdata[16]; \
+ local T ldata[16]; \
+ generic T * p1 = &pdata[lid]; \
+ generic T * p2 = &ldata[lid]; \
+ generic T *p = (gid & 1) ? p1 : p2; \
+ /* the expression below is not yet supported by clang */ \
+ /* generic T *p = (gid & 1) ? p1 : (T *)&ldata[lid]; */ \
+ *p = src[gid]; \
+ /* fill other data */ \
+ if(gid&1) { \
+ ldata[lid] = 20; \
+ } else { \
+ for (int i = 0; i < 16; i++) { \
+ pdata[i] = src[lid]; \
+ } \
+ } \
+ barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+ generic T * q1 = &pdata[lid]; \
+ generic T * q2 = &ldata[lid]; \
+ generic T *q = (gid & 1) ? q1 : q2; \
+ atomic_fetch_add((atomic_int*)q , pdata[lid]); \
+ dst[gid] = *q; \
+}
+
+GENERIC_KERNEL(int)
+//GENERIC_KERNEL(long)
+
diff --git a/kernels/compiler_generic_pointer.cl b/kernels/compiler_generic_pointer.cl
new file mode 100644
index 0000000..a06b192
--- /dev/null
+++ b/kernels/compiler_generic_pointer.cl
@@ -0,0 +1,33 @@
+#define GENERIC_KERNEL(T) \
+kernel void compiler_generic_pointer_##T(global T *src, global T *dst) \
+{ \
+ size_t gid = get_global_id(0); \
+ size_t lid = get_local_id(0); \
+ private T pdata[16]; \
+ local T ldata[16]; \
+ generic T * p1 = &pdata[lid]; \
+ generic T * p2 = &ldata[lid]; \
+ generic T *p = (gid & 1) ? p1 : p2; \
+ /* the expression below is not yet supported by clang */ \
+ /* generic T *p = (gid & 1) ? p1 : (T *)&ldata[lid]; */ \
+ *p = src[gid]; \
+ /* fill other data */ \
+ if(gid&1) { \
+ ldata[lid] = 20; \
+ } else { \
+ for (int i = 0; i < 16; i++) { \
+ pdata[i] = src[lid]; \
+ } \
+ } \
+ barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+ generic T * q1 = &pdata[lid]; \
+ generic T * q2 = &ldata[lid]; \
+ generic T *q = (gid & 1) ? q1 : q2; \
+ dst[gid] = *q + pdata[lid]; \
+}
+
+GENERIC_KERNEL(int)
+GENERIC_KERNEL(char)
+GENERIC_KERNEL(short)
+GENERIC_KERNEL(long)
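
[editor's note] Because the backslash-continued macro is hard to read, this is (roughly) what GENERIC_KERNEL(int) above expands to:

    kernel void compiler_generic_pointer_int(global int *src, global int *dst)
    {
        size_t gid = get_global_id(0);
        size_t lid = get_local_id(0);
        private int pdata[16];
        local int ldata[16];
        generic int *p1 = &pdata[lid];      /* private -> generic */
        generic int *p2 = &ldata[lid];      /* local   -> generic */
        generic int *p = (gid & 1) ? p1 : p2;
        *p = src[gid];
        if (gid & 1) {
            ldata[lid] = 20;
        } else {
            for (int i = 0; i < 16; i++)
                pdata[i] = src[lid];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        generic int *q1 = &pdata[lid];
        generic int *q2 = &ldata[lid];
        generic int *q = (gid & 1) ? q1 : q2;
        dst[gid] = *q + pdata[lid];
    }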
diff --git a/kernels/compiler_pipe_builtin.cl b/kernels/compiler_pipe_builtin.cl
new file mode 100644
index 0000000..4e8dcc4
--- /dev/null
+++ b/kernels/compiler_pipe_builtin.cl
@@ -0,0 +1,117 @@
+typedef struct{
+ int a;
+ int b;
+}mystruct;
+
+__kernel void compiler_pipe_convenience_write_int(write_only pipe int p, __global int *src)
+{
+ int gid = get_global_id(0);
+ write_pipe(p, &src[gid]);
+}
+__kernel void compiler_pipe_convenience_read_int(read_only pipe int p, __global int *dst)
+{
+ int gid = get_global_id(0);
+ read_pipe(p, &dst[gid]);
+}
+__kernel void compiler_pipe_convenience_write_mystruct(write_only pipe mystruct p, __global mystruct *src)
+{
+ int gid = get_global_id(0);
+ write_pipe(p, &src[gid]);
+}
+__kernel void compiler_pipe_convenience_read_mystruct(read_only pipe mystruct p, __global mystruct *dst)
+{
+ int gid = get_global_id(0);
+ read_pipe(p, &dst[gid]);
+}
+
+__kernel void compiler_pipe_reserve_write_int(write_only pipe int p, __global int *src)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = reserve_write_pipe(p, 1);
+ if(is_valid_reserve_id(res_id))
+ {
+ write_pipe(p, res_id, 0, &src[gid]);
+ commit_write_pipe(p, res_id);
+ }
+}
+__kernel void compiler_pipe_reserve_read_int(read_only pipe int p, __global int *dst)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = reserve_read_pipe(p, 1);
+ if(is_valid_reserve_id(res_id))
+ {
+ read_pipe(p, res_id, 0, &dst[gid]);
+ commit_read_pipe(p, res_id);
+ }
+}
+__kernel void compiler_pipe_reserve_write_mystruct(write_only pipe mystruct p, __global mystruct *src)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = reserve_write_pipe(p, 1);
+ if(is_valid_reserve_id(res_id))
+ {
+ write_pipe(p, res_id, 0, &src[gid]);
+ commit_write_pipe(p, res_id);
+ }
+}
+__kernel void compiler_pipe_reserve_read_mystruct(read_only pipe mystruct p, __global mystruct *dst)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = reserve_read_pipe(p, 1);
+ if(is_valid_reserve_id(res_id))
+ {
+ read_pipe(p, res_id, 0, &dst[gid]);
+ commit_read_pipe(p, res_id);
+ }
+}
+
+__kernel void compiler_pipe_workgroup_write_int(write_only pipe int p, __global int *src)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = work_group_reserve_write_pipe(p, get_local_size(0));
+ if(is_valid_reserve_id(res_id))
+ {
+ write_pipe(p, res_id, get_local_id(0), &src[gid]);
+ work_group_commit_write_pipe(p, res_id);
+ }
+}
+__kernel void compiler_pipe_workgroup_read_int(read_only pipe int p, __global int *dst)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = work_group_reserve_read_pipe(p, get_local_size(0));
+ if(is_valid_reserve_id(res_id))
+ {
+ read_pipe(p, res_id, get_local_id(0), &dst[gid]);
+ work_group_commit_read_pipe(p, res_id);
+ }
+}
+__kernel void compiler_pipe_workgroup_write_mystruct(write_only pipe mystruct p, __global mystruct *src)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = work_group_reserve_write_pipe(p, get_local_size(0));
+ if(is_valid_reserve_id(res_id))
+ {
+ write_pipe(p, res_id, get_local_id(0), &src[gid]);
+ work_group_commit_write_pipe(p, res_id);
+ }
+}
+__kernel void compiler_pipe_workgroup_read_mystruct(read_only pipe mystruct p, __global mystruct *dst)
+{
+ int gid = get_global_id(0);
+ reserve_id_t res_id = work_group_reserve_read_pipe(p, get_local_size(0));
+ if(is_valid_reserve_id(res_id))
+ {
+ read_pipe(p, res_id, get_local_id(0), &dst[gid]);
+ work_group_commit_read_pipe(p, res_id);
+ }
+}
+
+__kernel void compiler_pipe_query(write_only pipe int p, __global uint *src)
+{
+ int gid = get_global_id(0);
+ write_pipe(p,&gid);
+ if(gid == 0) {
+ src[0] = get_pipe_num_packets(p);
+ src[1] = get_pipe_max_packets(p);
+ }
+}
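
[editor's note] The pipe kernels above rely on the host creating the pipe object and binding it as the kernel argument. A hedged sketch for the int convenience pair (`ctx` and `kernel` are assumed to exist):

    /* Sketch: a pipe of 1024 int packets for the convenience
     * read/write kernels above.  flags must be 0 or
     * CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS; 0 means the default. */
    cl_int err;
    cl_mem pipe = clCreatePipe(ctx, 0,
                               sizeof(cl_int), /* packet size */
                               1024,           /* max packets */
                               NULL, &err);
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &pipe);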
diff --git a/kernels/compiler_program_global.cl b/kernels/compiler_program_global.cl
new file mode 100644
index 0000000..fbe030f
--- /dev/null
+++ b/kernels/compiler_program_global.cl
@@ -0,0 +1,77 @@
+struct config{
+ int s0;
+ global short *s1;
+};
+
+global int i = 5;
+global int bb = 4;
+global int *global p;
+
+/* array */
+global int ba[12];
+
+/* short/long data type */
+global short s;
+global short s2;
+global long l;
+
+/* pointer in constant AS to global */
+global int * constant px =&i;
+
+/* constant pointer relocation */
+constant int x = 2;
+constant int y =1;
+constant int *constant z[2] = {&x, &y};
+
+/* structure with pointer field */
+global struct config c[2] = {{1, &s}, {2, &s2} };
+
+
+global int a = 1;
+global int b = 2;
+global int * constant gArr[2]= {&a, &b};
+
+global int a_var[1] = {0};
+global int *p_var = a_var;
+
+__kernel void compiler_program_global0(const global int *src, int dynamic) {
+ size_t gid = get_global_id(0);
+ /* global read/write */
+ p = &i;
+ *p += 1;
+
+ /* pointer in struct memory access */
+ *c[gid&1].s1 += 2;
+
+ s = 2;
+ l = 3;
+
+ /* constant AS pointer (points to global) memory access */
+ *px += *z[dynamic];
+
+ p = &bb;
+ /* array */
+ if (gid < 11)
+ ba[gid] = src[gid];
+}
+
+__kernel void compiler_program_global1(global int *dst, int dynamic) {
+ size_t gid = get_global_id(0);
+// static global sg;
+
+ dst[11] = i;
+ dst[12] = *p;
+ dst[13] = s;
+ dst[14] = l;
+ if (p_var == a_var)
+ dst[15] = *gArr[dynamic];
+
+ if (gid < 11)
+ dst[gid] = ba[gid];
+}
+
+__kernel void nouse(int dynamic) {
+ c[0].s1 = &s2;
+ p_var = a_var + dynamic;
+}
+
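
[editor's note] The point of this test is that OpenCL 2.0 program-scope variables persist between kernel launches on the same program object, which is why compiler_program_global1 can read back state written by compiler_program_global0. The idea in miniature (a sketch, not part of the test; requires building with -cl-std=CL2.0):

    /* Minimal sketch of the feature under test: a program-scope global
     * keeps its value across launches of kernels in the same program. */
    global int counter = 0;

    kernel void bump(void)
    {
        if (get_global_id(0) == 0)
            counter++;
    }

    kernel void read_back(global int *out)
    {
        out[0] = counter;
    }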
diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl
index 322da74..c771eea 100644
--- a/kernels/compiler_sub_group_shuffle.cl
+++ b/kernels/compiler_sub_group_shuffle.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_int(global int *dst, int c)
{
int i = get_global_id(0);
if (i == 0)
@@ -16,3 +16,23 @@ __kernel void compiler_sub_group_shuffle(global int *dst, int c)
dst[i*4+2] = o2;
dst[i*4+3] = o3;
}
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_short(global short *dst, int c)
+{
+ short i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ short from = i;
+ int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ short o0 = get_sub_group_local_id();
+ short o1 = intel_sub_group_shuffle(from, c);
+ short o2 = intel_sub_group_shuffle(from, 5);
+ short o3 = intel_sub_group_shuffle(from, j);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_sub_group_shuffle_down.cl b/kernels/compiler_sub_group_shuffle_down.cl
index 769fc3f..40bac05 100644
--- a/kernels/compiler_sub_group_shuffle_down.cl
+++ b/kernels/compiler_sub_group_shuffle_down.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_down_int(global int *dst, int c)
{
int i = get_global_id(0);
if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
dst[i*4+2] = o2;
dst[i*4+3] = o3;
}
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_down_short(global short *dst, int c)
+{
+ short i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ short from = i;
+ int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ int k = get_sub_group_local_id() + 1;
+ short o0 = intel_sub_group_shuffle_down((short)123, (short)456, c);
+ short o1 = intel_sub_group_shuffle_down((short)123, from, c);
+ short o2 = intel_sub_group_shuffle_down(from, (short)-from, k);
+ short o3 = intel_sub_group_shuffle_down(from, (short)321, j);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_sub_group_shuffle_up.cl b/kernels/compiler_sub_group_shuffle_up.cl
index 5c5cee1..fd287d5 100644
--- a/kernels/compiler_sub_group_shuffle_up.cl
+++ b/kernels/compiler_sub_group_shuffle_up.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_up_int(global int *dst, int c)
{
int i = get_global_id(0);
if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
dst[i*4+2] = o2;
dst[i*4+3] = o3;
}
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_up_short(global short *dst, int c)
+{
+ short i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ short from = i;
+ int j = get_sub_group_local_id() + 1;
+ int k = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ short o0 = intel_sub_group_shuffle_up((short)123, (short)456, c);
+ short o1 = intel_sub_group_shuffle_up((short)123, from, c);
+ short o2 = intel_sub_group_shuffle_up(from, (short)-from, k);
+ short o3 = intel_sub_group_shuffle_up(from, (short)321, j);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_sub_group_shuffle_xor.cl b/kernels/compiler_sub_group_shuffle_xor.cl
index 8bc15d3..df3dfe7 100644
--- a/kernels/compiler_sub_group_shuffle_xor.cl
+++ b/kernels/compiler_sub_group_shuffle_xor.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_xor_int(global int *dst, int c)
{
int i = get_global_id(0);
if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
dst[i*4+2] = o2;
dst[i*4+3] = o3;
}
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_xor_short(global short *dst, int c)
+{
+ short i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ short from = i;
+ int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ int k = get_sub_group_local_id() + 1;
+ short o0 = get_sub_group_local_id();
+ short o1 = intel_sub_group_shuffle_xor(from, c);
+ short o2 = intel_sub_group_shuffle_xor(from, j);
+ short o3 = intel_sub_group_shuffle_xor(from, k);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl
index 96d38d9..3d16d67 100644
--- a/kernels/compiler_subgroup_broadcast.cl
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -1,7 +1,7 @@
/*
* Subgroup broadcast 1D functions
*/
-
+#ifndef HALF
kernel void compiler_subgroup_broadcast_imm_int(global int *src,
global int *dst,
uint simd_id)
@@ -32,3 +32,27 @@ kernel void compiler_subgroup_broadcast_long(global long *src,
long broadcast_val = sub_group_broadcast(val, simd_id);
dst[index] = broadcast_val;
}
+kernel void compiler_subgroup_broadcast_short(global short *src,
+ global short *dst,
+ uint simd_id)
+{
+ uint index = get_global_id(0);
+
+ short val = src[index];
+ short broadcast_val = sub_group_broadcast(val, simd_id);
+ dst[index] = broadcast_val;
+}
+#else
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_broadcast_half(global half *src,
+ global half *dst,
+ uint simd_id)
+{
+ uint index = get_global_id(0);
+
+ half val = src[index];
+ half broadcast_val = sub_group_broadcast(val, simd_id);
+ //printf("%d val %d is %d\n",index,as_ushort(val), as_ushort(broadcast_val));
+ dst[index] = broadcast_val;
+}
+#endif
diff --git a/kernels/compiler_subgroup_buffer_block_read.cl b/kernels/compiler_subgroup_buffer_block_read.cl
index 9edaa2e..4cbf894 100644
--- a/kernels/compiler_subgroup_buffer_block_read.cl
+++ b/kernels/compiler_subgroup_buffer_block_read.cl
@@ -1,31 +1,62 @@
-__kernel void compiler_subgroup_buffer_block_read1(global uint *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui1(global uint *src, global uint *dst)
{
int id = get_global_id(0);
global uint * p = src + get_sub_group_id() * get_max_sub_group_size();
- uint tmp = intel_sub_group_block_read(p);
+ uint tmp = intel_sub_group_block_read_ui(p);
dst[id] = tmp;
}
-__kernel void compiler_subgroup_buffer_block_read2(global uint *src, global uint2 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui2(global uint *src, global uint2 *dst)
{
int id = get_global_id(0);
global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
- uint2 tmp = intel_sub_group_block_read2(p);
+ uint2 tmp = intel_sub_group_block_read_ui2(p);
dst[id] = tmp;
}
-__kernel void compiler_subgroup_buffer_block_read4(global uint *src, global uint4 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui4(global uint *src, global uint4 *dst)
{
int id = get_global_id(0);
global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
- uint4 tmp = intel_sub_group_block_read4(p);
+ uint4 tmp = intel_sub_group_block_read_ui4(p);
dst[id] = tmp;
}
-__kernel void compiler_subgroup_buffer_block_read8(global uint *src, global uint8 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui8(global uint *src, global uint8 *dst)
{
int id = get_global_id(0);
global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
- uint8 tmp = intel_sub_group_block_read8(p);
+ uint8 tmp = intel_sub_group_block_read_ui8(p);
dst[id] = tmp;
}
+#ifdef SHORT
+__kernel void compiler_subgroup_buffer_block_read_us1(global ushort *src, global ushort *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = src + get_sub_group_id() * get_max_sub_group_size();
+ ushort tmp = intel_sub_group_block_read_us(p);
+ dst[id] = tmp;
+}
+__kernel void compiler_subgroup_buffer_block_read_us2(global ushort *src, global ushort2 *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
+ ushort2 tmp = intel_sub_group_block_read_us2(p);
+ dst[id] = tmp;
+}
+__kernel void compiler_subgroup_buffer_block_read_us4(global ushort *src, global ushort4 *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
+ ushort4 tmp = intel_sub_group_block_read_us4(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_buffer_block_read_us8(global ushort *src, global ushort8 *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
+ ushort8 tmp = intel_sub_group_block_read_us8(p);
+ dst[id] = tmp;
+}
+#endif
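
The unsuffixed block read names give way to explicitly typed variants: _ui for uint (matching cl_intel_subgroups) and _us for ushort (matching cl_intel_subgroups_short), with the ushort kernels fenced behind a SHORT define so the harness can skip them where the extension is missing. The pointer stride follows directly from the access width; a worked example, assuming a SIMD16 device:

    /* _ui4 on SIMD16 (assumed max sub-group size 16): every work-item
     * reads 4 uints, so one sub-group consumes 16 * 4 = 64 uints,
     * matching the "get_max_sub_group_size()*4" stride applied to p
     * in the kernel above. */
    size_t sg_size = 16;           /* assumed max sub-group size */
    size_t stride  = sg_size * 4;  /* uints consumed per sub-group: 64 */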
diff --git a/kernels/compiler_subgroup_buffer_block_write.cl b/kernels/compiler_subgroup_buffer_block_write.cl
index f735855..f452dcc 100644
--- a/kernels/compiler_subgroup_buffer_block_write.cl
+++ b/kernels/compiler_subgroup_buffer_block_write.cl
@@ -1,27 +1,55 @@
-__kernel void compiler_subgroup_buffer_block_write1(global uint *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui1(global uint *src, global uint *dst)
{
int id = get_global_id(0);
global uint * p = dst + get_sub_group_id() * get_max_sub_group_size();
- intel_sub_group_block_write(p,src[id]);
+ intel_sub_group_block_write_ui(p,src[id]);
}
-__kernel void compiler_subgroup_buffer_block_write2(global uint2 *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui2(global uint2 *src, global uint *dst)
{
int id = get_global_id(0);
global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
- intel_sub_group_block_write2(p,src[id]);
+ intel_sub_group_block_write_ui2(p,src[id]);
}
-__kernel void compiler_subgroup_buffer_block_write4(global uint4 *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui4(global uint4 *src, global uint *dst)
{
int id = get_global_id(0);
global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*4;
- intel_sub_group_block_write4(p,src[id]);
+ intel_sub_group_block_write_ui4(p,src[id]);
}
-__kernel void compiler_subgroup_buffer_block_write8(global uint8 *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui8(global uint8 *src, global uint *dst)
{
int id = get_global_id(0);
global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*8;
- intel_sub_group_block_write8(p,src[id]);
+ intel_sub_group_block_write_ui8(p,src[id]);
}
+#ifdef SHORT
+__kernel void compiler_subgroup_buffer_block_write_us1(global ushort *src, global ushort *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size();
+ intel_sub_group_block_write_us(p,src[id]);
+}
+
+__kernel void compiler_subgroup_buffer_block_write_us2(global ushort2 *src, global ushort *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
+ intel_sub_group_block_write_us2(p,src[id]);
+}
+
+__kernel void compiler_subgroup_buffer_block_write_us4(global ushort4 *src, global ushort *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size()*4;
+ intel_sub_group_block_write_us4(p,src[id]);
+}
+__kernel void compiler_subgroup_buffer_block_write_us8(global ushort8 *src, global ushort *dst)
+{
+ int id = get_global_id(0);
+ global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size()*8;
+ intel_sub_group_block_write_us8(p,src[id]);
+}
+#endif
diff --git a/kernels/compiler_subgroup_image_block_read.cl b/kernels/compiler_subgroup_image_block_read.cl
index d5df6db..fa079b7 100644
--- a/kernels/compiler_subgroup_image_block_read.cl
+++ b/kernels/compiler_subgroup_image_block_read.cl
@@ -1,31 +1,64 @@
-__kernel void compiler_subgroup_image_block_read1(image2d_t src, global uint *dst)
+__kernel void compiler_subgroup_image_block_read_ui1(image2d_t src, global uint *dst)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- uint tmp = intel_sub_group_block_read(src,coord);
+ uint tmp = intel_sub_group_block_read_ui(src,coord);
dst[id] = tmp;
}
-__kernel void compiler_subgroup_image_block_read2(image2d_t src, global uint2 *dst)
+__kernel void compiler_subgroup_image_block_read_ui2(image2d_t src, global uint2 *dst)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- uint2 tmp = intel_sub_group_block_read2(src,coord);
+ uint2 tmp = intel_sub_group_block_read_ui2(src,coord);
dst[id] = tmp;
}
-__kernel void compiler_subgroup_image_block_read4(image2d_t src, global uint4 *dst)
+__kernel void compiler_subgroup_image_block_read_ui4(image2d_t src, global uint4 *dst)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- uint4 tmp = intel_sub_group_block_read4(src,coord);
+ uint4 tmp = intel_sub_group_block_read_ui4(src,coord);
dst[id] = tmp;
}
-__kernel void compiler_subgroup_image_block_read8(image2d_t src, global uint8 *dst)
+__kernel void compiler_subgroup_image_block_read_ui8(image2d_t src, global uint8 *dst)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- uint8 tmp = intel_sub_group_block_read8(src,coord);
+ uint8 tmp = intel_sub_group_block_read_ui8(src,coord);
dst[id] = tmp;
}
+#ifdef SHORT
+__kernel void compiler_subgroup_image_block_read_us1(image2d_t src, global ushort *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ ushort tmp = intel_sub_group_block_read_us(src,coord);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_image_block_read_us2(image2d_t src, global ushort2 *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ ushort2 tmp = intel_sub_group_block_read_us2(src,coord);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_image_block_read_us4(image2d_t src, global ushort4 *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ ushort4 tmp = intel_sub_group_block_read_us4(src,coord);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_image_block_read_us8(image2d_t src, global ushort8 *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ ushort8 tmp = intel_sub_group_block_read_us8(src,coord);
+ dst[id] = tmp;
+}
+#endif
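
For the image variants the x component of the coordinate is a byte offset rather than an element index, which is why the kernels above and in the write tests below scale get_simd_size()*get_sub_group_id() by sizeof(uint) or sizeof(ushort). A worked example, again assuming SIMD16:

    /* Byte column read by sub-group g in the _us variants on SIMD16:
     *   x = get_simd_size() * g * sizeof(ushort) = 16 * g * 2,
     * so sub-group 1 starts at byte offset 32 of the row. */
    int g = 1;
    int x = 16 * g * (int)sizeof(unsigned short);  /* == 32 */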
diff --git a/kernels/compiler_subgroup_image_block_write.cl b/kernels/compiler_subgroup_image_block_write.cl
index d9b3717..7d97c59 100644
--- a/kernels/compiler_subgroup_image_block_write.cl
+++ b/kernels/compiler_subgroup_image_block_write.cl
@@ -1,27 +1,55 @@
-__kernel void compiler_subgroup_image_block_write1(image2d_t dst, global uint *src)
+__kernel void compiler_subgroup_image_block_write_ui1(image2d_t dst, global uint *src)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- intel_sub_group_block_write(dst,coord, src[id]);
+ intel_sub_group_block_write_ui(dst,coord, src[id]);
}
-
-__kernel void compiler_subgroup_image_block_write2(image2d_t dst, global uint2 *src)
+__kernel void compiler_subgroup_image_block_write_ui2(image2d_t dst, global uint2 *src)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- intel_sub_group_block_write2(dst,coord, src[id]);
+ intel_sub_group_block_write_ui2(dst,coord, src[id]);
}
-__kernel void compiler_subgroup_image_block_write4(image2d_t dst, global uint4 *src)
+__kernel void compiler_subgroup_image_block_write_ui4(image2d_t dst, global uint4 *src)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- intel_sub_group_block_write4(dst,coord, src[id]);
+ intel_sub_group_block_write_ui4(dst,coord, src[id]);
}
-__kernel void compiler_subgroup_image_block_write8(image2d_t dst, global uint8 *src)
+__kernel void compiler_subgroup_image_block_write_ui8(image2d_t dst, global uint8 *src)
{
int id = get_global_id(0);
int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
- intel_sub_group_block_write8(dst,coord, src[id]);
+ intel_sub_group_block_write_ui8(dst,coord, src[id]);
+}
+#ifdef SHORT
+__kernel void compiler_subgroup_image_block_write_us1(image2d_t dst, global ushort *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ intel_sub_group_block_write_us(dst,coord, src[id]);
+}
+
+__kernel void compiler_subgroup_image_block_write_us2(image2d_t dst, global ushort2 *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ intel_sub_group_block_write_us2(dst,coord, src[id]);
+}
+
+__kernel void compiler_subgroup_image_block_write_us4(image2d_t dst, global ushort4 *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ intel_sub_group_block_write_us4(dst,coord, src[id]);
+}
+
+__kernel void compiler_subgroup_image_block_write_us8(image2d_t dst, global ushort8 *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+ intel_sub_group_block_write_us8(dst,coord, src[id]);
}
+#endif
diff --git a/kernels/compiler_subgroup_reduce.cl b/kernels/compiler_subgroup_reduce.cl
index 77ffb07..79d8e7d 100644
--- a/kernels/compiler_subgroup_reduce.cl
+++ b/kernels/compiler_subgroup_reduce.cl
@@ -1,6 +1,7 @@
/*
* Subgroup any/all functions
*/
+#ifndef HALF
kernel void compiler_subgroup_any(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int predicate = sub_group_any(val);
@@ -72,6 +73,17 @@ kernel void compiler_subgroup_reduce_add_float(global float *src, global float *
/*
* Subgroup reduce max functions
*/
+kernel void compiler_subgroup_reduce_max_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_reduce_max_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ //printf("src is %d\n",val);
+ ushort sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_reduce_max(val);
@@ -105,6 +117,17 @@ kernel void compiler_subgroup_reduce_max_float(global float *src, global float *
/*
* Subgroup reduce min functions
*/
+kernel void compiler_subgroup_reduce_min_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_reduce_min_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ //printf("src is %d\n",val);
+ ushort sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_reduce_min(val);
@@ -134,3 +157,21 @@ kernel void compiler_subgroup_reduce_min_float(global float *src, global float *
float sum = sub_group_reduce_min(val);
dst[get_global_id(0)] = sum;
}
+#else
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_reduce_add_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_reduce_max_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_reduce_min_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+#endif
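
The reduce built-ins combine one value per work-item across the whole sub-group and hand the result back to every lane, so within a sub-group all dst elements are equal. A CPU reference sketch for the max case (sg_size is the sub-group size, assumed to divide n evenly; the function name is illustrative):

    /* CPU reference for compiler_subgroup_reduce_max_short. */
    void reduce_max_ref(const short *src, short *expected,
                        size_t n, size_t sg_size)
    {
        for (size_t base = 0; base < n; base += sg_size) {
            short m = src[base];
            for (size_t j = 1; j < sg_size; j++)
                if (src[base + j] > m)
                    m = src[base + j];
            for (size_t j = 0; j < sg_size; j++)
                expected[base + j] = m;   /* same value for every lane */
        }
    }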
diff --git a/kernels/compiler_subgroup_scan_exclusive.cl b/kernels/compiler_subgroup_scan_exclusive.cl
index afc00d0..2c4b928 100644
--- a/kernels/compiler_subgroup_scan_exclusive.cl
+++ b/kernels/compiler_subgroup_scan_exclusive.cl
@@ -1,6 +1,19 @@
/*
* Subgroup scan exclusive add functions
*/
+#ifndef HALF
+kernel void compiler_subgroup_scan_exclusive_add_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_scan_exclusive_add(val);
@@ -34,6 +47,18 @@ kernel void compiler_subgroup_scan_exclusive_add_float(global float *src, global
/*
* Subgroup scan exclusive max functions
*/
+kernel void compiler_subgroup_scan_exclusive_max_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_scan_exclusive_max(val);
@@ -67,6 +92,18 @@ kernel void compiler_subgroup_scan_exclusive_max_float(global float *src, global
/*
* Subgroup scan exclusive min functions
*/
+kernel void compiler_subgroup_scan_exclusive_min_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_scan_exclusive_min(val);
@@ -96,3 +133,21 @@ kernel void compiler_subgroup_scan_exclusive_min_float(global float *src, global
float sum = sub_group_scan_exclusive_min(val);
dst[get_global_id(0)] = sum;
}
+#else
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_scan_exclusive_add_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_scan_exclusive_max_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_scan_exclusive_min_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+#endif
diff --git a/kernels/compiler_subgroup_scan_inclusive.cl b/kernels/compiler_subgroup_scan_inclusive.cl
index da1a6e6..def941c 100644
--- a/kernels/compiler_subgroup_scan_inclusive.cl
+++ b/kernels/compiler_subgroup_scan_inclusive.cl
@@ -1,6 +1,19 @@
/*
* Subgroup scan inclusive add functions
*/
+#ifndef HALF
+kernel void compiler_subgroup_scan_inclusive_add_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
kernel void compiler_subgroup_scan_inclusive_add_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_scan_inclusive_add(val);
@@ -34,6 +47,18 @@ kernel void compiler_subgroup_scan_inclusive_add_float(global float *src, global
/*
* Subgroup scan inclusive max functions
*/
+kernel void compiler_subgroup_scan_inclusive_max_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
kernel void compiler_subgroup_scan_inclusive_max_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_scan_inclusive_max(val);
@@ -67,6 +92,18 @@ kernel void compiler_subgroup_scan_inclusive_max_float(global float *src, global
/*
* Subgroup scan inclusive min functions
*/
+kernel void compiler_subgroup_scan_inclusive_min_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
kernel void compiler_subgroup_scan_inclusive_min_int(global int *src, global int *dst) {
int val = src[get_global_id(0)];
int sum = sub_group_scan_inclusive_min(val);
@@ -96,3 +133,21 @@ kernel void compiler_subgroup_scan_inclusive_min_float(global float *src, global
float sum = sub_group_scan_inclusive_min(val);
dst[get_global_id(0)] = sum;
}
+#else
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_scan_inclusive_add_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_scan_inclusive_max_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_scan_inclusive_min_half(global half *src, global half *dst) {
+ half val = src[get_global_id(0)];
+ half sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+#endif
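
The two scan flavours tested above differ only in whether the current lane is included: an exclusive scan yields the combination of all lanes before the current one (the operation's identity for lane 0), while an inclusive scan also folds in the current lane. For add over {1,2,3,4} that is {0,1,3,6} versus {1,3,6,10}. A CPU reference sketch for the add scans over one sub-group (function name illustrative):

    /* CPU reference for the add scans over one sub-group. */
    void scan_add_ref(const short *src, short *excl, short *incl,
                      size_t sg_size)
    {
        short acc = 0;
        for (size_t j = 0; j < sg_size; j++) {
            excl[j] = acc;            /* sum of lanes before j */
            acc    += src[j];
            incl[j] = acc;            /* sum including lane j */
        }
    }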
diff --git a/src/Android.mk b/src/Android.mk
index 9b63f7e..c195988 100644
--- a/src/Android.mk
+++ b/src/Android.mk
@@ -84,12 +84,23 @@ $(shell $(LOCAL_PATH)/git_sha1.sh $(LOCAL_PATH) ${GIT_SHA1})
LOCAL_SRC_FILES:= \
$(addprefix kernels/,$(addsuffix _str.c, $(KERNEL_NAMES))) \
$(addprefix kernels/,$(addsuffix _str.c, $(BUILT_IN_NAME))) \
+ cl_base_object.c \
cl_api.c \
+ cl_api_platform_id.c \
+ cl_api_device_id.c \
+ cl_api_mem.c \
+ cl_api_kernel.c \
+ cl_api_command_queue.c \
+ cl_api_event.c \
+ cl_api_context.c \
+ cl_api_sampler.c \
+ cl_api_program.c \
cl_alloc.c \
cl_kernel.c \
cl_program.c \
cl_gbe_loader.cpp \
cl_sampler.c \
+ cl_accelerator_intel.c \
cl_event.c \
cl_enqueue.c \
cl_image.c \
@@ -101,15 +112,16 @@ LOCAL_SRC_FILES:= \
cl_command_queue.c \
cl_command_queue.h \
cl_command_queue_gen7.c \
- cl_thread.c \
+ cl_command_queue_enqueue.c \
+ cl_device_enqueue.c \
+ cl_utils.c \
cl_driver.h \
cl_driver.cpp \
cl_driver_defs.c \
intel/intel_gpgpu.c \
intel/intel_batchbuffer.c \
intel/intel_driver.c \
- performance.c \
- cl_accelerator_intel.c
+ performance.c
LOCAL_SHARED_LIBRARIES := \
libgbe \
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 82be7ff..f3c4632 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,8 +3,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
${DRM_INCLUDE_DIRS}/../
${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
${CMAKE_CURRENT_SOURCE_DIR}/../include
- ${MESA_SOURCE_INCLUDES}
- ${LLVM_INCLUDE_DIR})
+ ${LLVM_INCLUDE_DIR}
+ ${OPENGL_INCLUDE_DIRS}
+ ${EGL_INCLUDE_DIRS})
macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
foreach (KF ${KERNEL_FILES})
@@ -65,7 +66,17 @@ MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}")
set(OPENCL_SRC
${KERNEL_STR_FILES}
+ cl_base_object.c
cl_api.c
+ cl_api_platform_id.c
+ cl_api_device_id.c
+ cl_api_mem.c
+ cl_api_kernel.c
+ cl_api_command_queue.c
+ cl_api_event.c
+ cl_api_context.c
+ cl_api_sampler.c
+ cl_api_program.c
cl_alloc.c
cl_kernel.c
cl_program.c
@@ -82,8 +93,11 @@ set(OPENCL_SRC
cl_context.c
cl_command_queue.c
cl_command_queue.h
+ cl_device_enqueue.c
+ cl_device_enqueue.h
cl_command_queue_gen7.c
- cl_thread.c
+ cl_command_queue_enqueue.c
+ cl_utils.c
cl_driver.h
cl_driver.cpp
cl_driver_defs.c
@@ -108,14 +122,11 @@ if (CMRT_FOUND)
set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp)
endif (CMRT_FOUND)
-if (EGL_FOUND AND MESA_SOURCE_FOUND)
- set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
- SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
- SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
- SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
-else(EGL_FOUND AND MESA_SOURCE_FOUND)
- SET(OPTIONAL_EGL_LIBRARY "")
-endif (EGL_FOUND AND MESA_SOURCE_FOUND)
+if (OPENGL_FOUND AND EGL_FOUND)
+ set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c )
+ SET(CMAKE_CXX_FLAGS "-DHAS_GL_EGL ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_GL_EGL ${CMAKE_C_FLAGS}")
+endif (OPENGL_FOUND AND EGL_FOUND)
if (OCLIcd_FOUND)
set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
@@ -148,6 +159,11 @@ if (HAVE_DRM_INTEL_MIN_EU_IN_POOL)
SET(CMAKE_C_FLAGS "-DHAS_MIN_EU_IN_POOL ${CMAKE_C_FLAGS}")
endif (HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+if (HAVE_DRM_INTEL_BO_SET_SOFTPIN)
+ SET(CMAKE_CXX_FLAGS "-DHAS_BO_SET_SOFTPIN ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_BO_SET_SOFTPIN ${CMAKE_C_FLAGS}")
+endif (HAVE_DRM_INTEL_BO_SET_SOFTPIN)
+
set(GIT_SHA1 "git_sha1.h")
add_custom_target(${GIT_SHA1} ALL
COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
@@ -156,7 +172,7 @@ add_custom_target(${GIT_SHA1} ALL
SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
-link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR} ${OPENGL_LIBDIR} ${EGL_LIBDIR})
add_library(cl SHARED ${OPENCL_SRC})
ADD_DEPENDENCIES(cl ${GIT_SHA1})
target_link_libraries(
@@ -170,5 +186,5 @@ target_link_libraries(
${CMAKE_THREAD_LIBS_INIT}
${CMAKE_DL_LIBS}
${OPENGL_LIBRARIES}
- ${OPTIONAL_EGL_LIBRARY})
+ ${EGL_LIBRARIES})
install (TARGETS cl LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
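
With this change the GL sharing path no longer requires a Mesa source tree: stock OpenGL plus EGL detection now gates cl_mem_gl.c and cl_gl_api.c, and the old HAS_EGL define becomes HAS_GL_EGL. Illustrative use of the new guard; the prototype is the standard GL-sharing entry point, but the placement is a sketch, not the actual file contents:

    #ifdef HAS_GL_EGL
    /* Only compiled when both OpenGL and EGL were found at configure time. */
    cl_mem clCreateFromGLBuffer(cl_context ctx, cl_mem_flags flags,
                                cl_GLuint bufobj, cl_int *errcode_ret);
    #endif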
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
index cda8963..ae08184 100644
--- a/src/cl_accelerator_intel.c
+++ b/src/cl_accelerator_intel.c
@@ -19,9 +19,7 @@ cl_accelerator_intel_new(cl_context ctx,
/* Allocate and initialize the structure itself */
TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
- SET_ICD(accel->dispatch)
- accel->ref_n = 1;
- accel->magic = CL_MAGIC_ACCELERATOR_INTEL_HEADER;
+ CL_OBJECT_INIT_BASE(accel, CL_OBJECT_ACCELERATOR_INTEL_MAGIC);
if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
err = CL_INVALID_ACCELERATOR_TYPE_INTEL;
@@ -37,12 +35,12 @@ cl_accelerator_intel_new(cl_context ctx,
/* Append the accelerator_intel in the context accelerator_intel list */
/* is this really needed? */
- pthread_mutex_lock(&ctx->accelerator_intel_lock);
+ CL_OBJECT_LOCK(ctx);
accel->next = ctx->accels;
if (ctx->accels != NULL)
ctx->accels->prev = accel;
ctx->accels = accel;
- pthread_mutex_unlock(&ctx->accelerator_intel_lock);
+ CL_OBJECT_UNLOCK(ctx);
accel->ctx = ctx;
cl_context_add_ref(ctx);
@@ -60,7 +58,7 @@ error:
LOCAL void
cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
{
- atomic_inc(&accel->ref_n);
+ CL_OBJECT_INC_REF(accel);
}
LOCAL void
@@ -68,19 +66,20 @@ cl_accelerator_intel_delete(cl_accelerator_intel accel)
{
if (UNLIKELY(accel == NULL))
return;
- if (atomic_dec(&accel->ref_n) > 1)
+ if (CL_OBJECT_DEC_REF(accel) > 1)
return;
/* Remove the accelerator_intel in the context accelerator_intel list */
- pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+ CL_OBJECT_LOCK(accel->ctx);
if (accel->prev)
accel->prev->next = accel->next;
if (accel->next)
accel->next->prev = accel->prev;
if (accel->ctx->accels == accel)
accel->ctx->accels = accel->next;
- pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+ CL_OBJECT_UNLOCK(accel->ctx);
cl_context_delete(accel->ctx);
+ CL_OBJECT_DESTROY_BASE(accel);
cl_free(accel);
}
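
This file is representative of a tree-wide refactor: the per-object dispatch/magic/refcount triple and the dedicated accelerator_intel_lock give way to a shared _cl_base_object with CL_OBJECT_INIT_BASE/CL_OBJECT_DESTROY_BASE, CL_OBJECT_INC_REF/CL_OBJECT_DEC_REF, and a per-object lock taken via CL_OBJECT_LOCK/UNLOCK. A sketch of the shape such a base has, with fields inferred from the call sites here rather than from cl_base_object.h itself:

    /* Illustrative only; not beignet's actual definition. */
    typedef struct _cl_base_object {
        DEFINE_ICD(dispatch);    /* ICD dispatch table, kept first */
        cl_ulong magic;          /* per-type tag, e.g. the *_MAGIC constants */
        volatile int ref_n;      /* refcount behind CL_OBJECT_INC/DEC_REF */
        pthread_mutex_t lock;    /* taken by CL_OBJECT_LOCK/UNLOCK */
    } _cl_base_object;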
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
index cecfd2a..435ae73 100644
--- a/src/cl_accelerator_intel.h
+++ b/src/cl_accelerator_intel.h
@@ -1,22 +1,25 @@
#ifndef __CL_ACCELERATOR_INTEL_H__
#define __CL_ACCELERATOR_INTEL_H__
+#include "cl_base_object.h"
#include "CL/cl.h"
#include "CL/cl_ext.h"
#include <stdint.h>
struct _cl_accelerator_intel {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a accelerator_intel object */
- volatile int ref_n; /* This object is reference counted */
+ _cl_base_object base;
cl_accelerator_intel prev, next; /* chained in the context's accelerator list; is the chain needed? */
cl_context ctx; /* Context it belongs to */
cl_accelerator_type_intel type;
union {
cl_motion_estimation_desc_intel me;
- }desc; /* save desc before we decide how to handle it */
+ } desc; /* save desc before we decide how to handle it */
};
+#define CL_OBJECT_ACCELERATOR_INTEL_MAGIC 0x7e6a08c9a7ac3e3fLL
+#define CL_OBJECT_IS_ACCELERATOR_INTEL(obj) \
+ (((cl_base_object)obj)->magic == CL_OBJECT_ACCELERATOR_INTEL_MAGIC)
+
cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
cl_accelerator_type_intel accel_type,
size_t desc_sz,
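
The cl_api.c rewrite that follows moves most entry points into per-object cl_api_*.c files and adds the OpenCL 2.0 shared-virtual-memory calls (clSVMAlloc/clSVMFree and the clEnqueueSVM* family). A minimal host-side sketch of how these standard calls chain together; context and queue are assumed valid, error handling elided:

    /* Coarse-grained SVM round trip (sketch). */
    int *buf = clSVMAlloc(ctx, CL_MEM_READ_WRITE, 4096, 0);
    clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, buf, 4096, 0, NULL, NULL);
    buf[0] = 42;                              /* host access while mapped */
    clEnqueueSVMUnmap(queue, buf, 0, NULL, NULL);
    clEnqueueSVMMemFill(queue, buf, &(int){0}, sizeof(int), 4096,
                        0, NULL, NULL);       /* refill with a zero pattern */
    clFinish(queue);
    clSVMFree(ctx, buf);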
diff --git a/src/cl_api.c b/src/cl_api.c
index a7c78f0..24b8b3d 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -67,117 +67,6 @@ typedef intptr_t cl_device_partition_property;
return RET; \
} while(0)
-static inline cl_int
-handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
- cl_event* event, enqueue_data* data, cl_command_type type)
-{
- cl_int status = cl_event_wait_events(num, wait_list, queue);
- cl_event e = NULL;
- if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
- e = cl_event_new(queue->ctx, queue, type, event!=NULL);
-
- /* if need profiling, add the submit timestamp here. */
- if (e->type != CL_COMMAND_USER &&
- e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
- cl_event_get_queued_cpu_timestamp(e);
- }
-
- if(event != NULL)
- *event = e;
- if(status == CL_ENQUEUE_EXECUTE_DEFER) {
- cl_event_new_enqueue_callback(e, data, num, wait_list);
- }
- }
- set_current_event(queue, e);
- return status;
-}
-
-/* The following code checking overlap is from Appendix of openCL spec 1.1 */
-cl_bool check_copy_overlap(const size_t src_offset[3],
- const size_t dst_offset[3],
- const size_t region[3],
- size_t row_pitch, size_t slice_pitch)
-{
- const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
- const size_t src_max[] = {src_offset[0] + region[0],
- src_offset[1] + region[1],
- src_offset[2] + region[2]};
- const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
- const size_t dst_max[] = {dst_offset[0] + region[0],
- dst_offset[1] + region[1],
- dst_offset[2] + region[2]};
- // Check for overlap
- cl_bool overlap = CL_TRUE;
- unsigned i;
- size_t dst_start = dst_offset[2] * slice_pitch +
- dst_offset[1] * row_pitch + dst_offset[0];
- size_t dst_end = dst_start + (region[2] * slice_pitch +
- region[1] * row_pitch + region[0]);
- size_t src_start = src_offset[2] * slice_pitch +
- src_offset[1] * row_pitch + src_offset[0];
- size_t src_end = src_start + (region[2] * slice_pitch +
- region[1] * row_pitch + region[0]);
-
- for (i=0; i != 3; ++i) {
- overlap = overlap && (src_min[i] < dst_max[i])
- && (src_max[i] > dst_min[i]);
- }
-
- if (!overlap) {
- size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
- src_offset[0] + region[0] - row_pitch : 0;
- size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
- dst_offset[0] + region[0] - row_pitch : 0;
- if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
- (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
- if ( (src_start <= dst_start && dst_start < src_end) ||
- (dst_start <= src_start && src_start < dst_end) )
- overlap = CL_TRUE;
- }
- if (region[2] > 1) {
- size_t src_height = slice_pitch / row_pitch;
- size_t dst_height = slice_pitch / row_pitch;
- size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
- src_offset[1] + region[1] - src_height : 0;
- size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
- dst_offset[1] + region[1] - dst_height : 0;
- if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
- (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
- if ( (src_start <= dst_start && dst_start < src_end) ||
- (dst_start <= src_start && src_start < dst_end) )
- overlap = CL_TRUE;
- }
- }
- }
- return overlap;
-}
-
-static cl_int
-cl_check_device_type(cl_device_type device_type)
-{
- const cl_device_type valid = CL_DEVICE_TYPE_GPU
- | CL_DEVICE_TYPE_CPU
- | CL_DEVICE_TYPE_ACCELERATOR
- | CL_DEVICE_TYPE_DEFAULT
- | CL_DEVICE_TYPE_CUSTOM;
-
- if( (device_type & valid) == 0) {
- return CL_INVALID_DEVICE_TYPE;
- }
- if(UNLIKELY(!(device_type & CL_DEVICE_TYPE_DEFAULT) && !(device_type & CL_DEVICE_TYPE_GPU)))
- return CL_DEVICE_NOT_FOUND;
-
- return CL_SUCCESS;
-}
-
-static cl_int
-cl_device_id_is_ok(const cl_device_id device)
-{
- if(UNLIKELY(device == NULL)) return CL_FALSE;
- return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
-}
-
cl_int
clGetPlatformIDs(cl_uint num_entries,
cl_platform_id * platforms,
@@ -191,299 +80,6 @@ clGetPlatformIDs(cl_uint num_entries,
return cl_get_platform_ids(num_entries, platforms, num_platforms);
}
-cl_int
-clGetPlatformInfo(cl_platform_id platform,
- cl_platform_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- /* Only one platform. This is easy */
- if (UNLIKELY(platform != NULL && platform != cl_get_platform_default()))
- return CL_INVALID_PLATFORM;
-
- return cl_get_platform_info(platform,
- param_name,
- param_value_size,
- param_value,
- param_value_size_ret);
-}
-
-cl_int
-clGetDeviceIDs(cl_platform_id platform,
- cl_device_type device_type,
- cl_uint num_entries,
- cl_device_id * devices,
- cl_uint * num_devices)
-{
- cl_int err = CL_SUCCESS;
-
- /* Check parameter consistency */
- if (UNLIKELY(devices == NULL && num_devices == NULL))
- return CL_INVALID_VALUE;
- if (UNLIKELY(platform && platform != cl_get_platform_default()))
- return CL_INVALID_PLATFORM;
- if (UNLIKELY(devices && num_entries == 0))
- return CL_INVALID_VALUE;
-
- err = cl_check_device_type(device_type);
- if(err != CL_SUCCESS)
- return err;
-
- return cl_get_device_ids(platform,
- device_type,
- num_entries,
- devices,
- num_devices);
-}
-
-cl_int
-clGetDeviceInfo(cl_device_id device,
- cl_device_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- return cl_get_device_info(device,
- param_name,
- param_value_size,
- param_value,
- param_value_size_ret);
-}
-
-cl_int
-clCreateSubDevices(cl_device_id in_device,
- const cl_device_partition_property * properties,
- cl_uint num_devices,
- cl_device_id * out_devices,
- cl_uint * num_devices_ret)
-{
- /* Check parameter consistency */
- if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
- return CL_INVALID_VALUE;
- if (UNLIKELY(in_device == NULL && properties == NULL))
- return CL_INVALID_VALUE;
-
- *num_devices_ret = 0;
- return CL_INVALID_DEVICE_PARTITION_COUNT;
-}
-
-cl_int
-clRetainDevice(cl_device_id device)
-{
- // XXX stub for C++ Bindings
- return CL_SUCCESS;
-}
-
-cl_int
-clReleaseDevice(cl_device_id device)
-{
-#ifdef HAS_CMRT
- if (device->cmrt_device != NULL)
- cmrt_destroy_device(device);
-#endif
-
- // XXX stub for C++ Bindings
- return CL_SUCCESS;
-}
-
-cl_context
-clCreateContext(const cl_context_properties * properties,
- cl_uint num_devices,
- const cl_device_id * devices,
- void (* pfn_notify) (const char*, const void*, size_t, void*),
- void * user_data,
- cl_int * errcode_ret)
-{
- cl_int err = CL_SUCCESS;
- cl_context context = NULL;
-
- /* Assert parameters correctness */
- INVALID_VALUE_IF (devices == NULL);
- INVALID_VALUE_IF (num_devices == 0);
- INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
-
- /* Now check if the user is asking for the right device */
- INVALID_DEVICE_IF (cl_device_id_is_ok(*devices) == CL_FALSE);
-
- context = cl_create_context(properties,
- num_devices,
- devices,
- pfn_notify,
- user_data,
- &err);
- initialize_env_var();
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return context;
-}
-
-cl_context
-clCreateContextFromType(const cl_context_properties * properties,
- cl_device_type device_type,
- void (CL_CALLBACK *pfn_notify) (const char *, const void *, size_t, void *),
- void * user_data,
- cl_int * errcode_ret)
-{
- cl_context context = NULL;
- cl_int err = CL_SUCCESS;
- cl_device_id devices[1];
- cl_uint num_devices = 1;
-
- INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
-
- err = cl_check_device_type(device_type);
- if(err != CL_SUCCESS) {
- goto error;
- }
-
- err = cl_get_device_ids(NULL,
- device_type,
- 1,
- &devices[0],
- &num_devices);
- if (err != CL_SUCCESS) {
- goto error;
- }
-
- context = cl_create_context(properties,
- num_devices,
- devices,
- pfn_notify,
- user_data,
- &err);
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return context;
-}
-
-cl_int
-clRetainContext(cl_context context)
-{
- cl_int err = CL_SUCCESS;
- CHECK_CONTEXT (context);
- cl_context_add_ref(context);
-error:
- return err;
-}
-
-cl_int
-clReleaseContext(cl_context context)
-{
- cl_int err = CL_SUCCESS;
- CHECK_CONTEXT (context);
- cl_context_delete(context);
-error:
- return err;
-}
-
-cl_int
-clGetContextInfo(cl_context context,
- cl_context_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- CHECK_CONTEXT (context);
-
- if (param_name == CL_CONTEXT_DEVICES) {
- FILL_GETINFO_RET (cl_device_id, 1, &context->device, CL_SUCCESS);
- } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
- cl_uint n = 1;
- FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
- } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
- cl_uint ref = context->ref_n;
- FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
- } else if (param_name == CL_CONTEXT_PROPERTIES) {
- if(context->prop_len > 0) {
- FILL_GETINFO_RET (cl_context_properties, context->prop_len, context->prop_user, CL_SUCCESS);
- } else {
- cl_context_properties n = 0;
- FILL_GETINFO_RET (cl_context_properties, 1, &n, CL_SUCCESS);
- }
- } else {
- return CL_INVALID_VALUE;
- }
-
-error:
- return err;
-}
-
-cl_command_queue
-clCreateCommandQueue(cl_context context,
- cl_device_id device,
- cl_command_queue_properties properties,
- cl_int * errcode_ret)
-{
- cl_command_queue queue = NULL;
- cl_int err = CL_SUCCESS;
- CHECK_CONTEXT (context);
-
- INVALID_DEVICE_IF (device != context->device);
- INVALID_VALUE_IF (properties & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE));
-
- if(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {/*not supported now.*/
- err = CL_INVALID_QUEUE_PROPERTIES;
- goto error;
- }
-
- queue = cl_context_create_queue(context, device, properties, &err);
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return queue;
-}
-
-cl_int
-clRetainCommandQueue(cl_command_queue command_queue)
-{
- cl_int err = CL_SUCCESS;
- CHECK_QUEUE (command_queue);
- cl_command_queue_add_ref(command_queue);
-error:
- return err;
-}
-
-cl_int
-clReleaseCommandQueue(cl_command_queue command_queue)
-{
- cl_int err = CL_SUCCESS;
- CHECK_QUEUE (command_queue);
- cl_command_queue_delete(command_queue);
-error:
- return err;
-}
-
-cl_int
-clGetCommandQueueInfo(cl_command_queue command_queue,
- cl_command_queue_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- CHECK_QUEUE (command_queue);
-
- if (param_name == CL_QUEUE_CONTEXT) {
- FILL_GETINFO_RET (cl_context, 1, &command_queue->ctx, CL_SUCCESS);
- } else if (param_name == CL_QUEUE_DEVICE) {
- FILL_GETINFO_RET (cl_device_id, 1, &command_queue->ctx->device, CL_SUCCESS);
- } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
- cl_uint ref = command_queue->ref_n;
- FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
- } else if (param_name == CL_QUEUE_PROPERTIES) {
- FILL_GETINFO_RET (cl_command_queue_properties, 1, &command_queue->props, CL_SUCCESS);
- } else {
- return CL_INVALID_VALUE;
- }
-
-error:
- return err;
-}
-
cl_mem
clCreateBuffer(cl_context context,
cl_mem_flags flags,
@@ -538,7 +134,7 @@ clCreateImage(cl_context context,
goto error;
}
if (image_format->image_channel_order < CL_R ||
- image_format->image_channel_order > CL_RGBx) {
+ image_format->image_channel_order > CL_sBGRA) {
err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
goto error;
}
@@ -583,249 +179,528 @@ error:
return mem;
}
-cl_mem
-clCreateImage2D(cl_context context,
- cl_mem_flags flags,
- const cl_image_format * image_format,
- size_t image_width,
- size_t image_height,
- size_t image_row_pitch,
- void * host_ptr,
- cl_int * errcode_ret)
+void *
+clSVMAlloc (cl_context context,
+ cl_svm_mem_flags flags,
+ size_t size,
+ unsigned int alignment)
{
- cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
- cl_image_desc image_desc;
- memset(&image_desc, 0, sizeof(image_desc));
-
- image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
- image_desc.image_width = image_width;
- image_desc.image_height = image_height;
- image_desc.image_row_pitch = image_row_pitch;
-
- mem = cl_mem_new_image(context,
- flags,
- image_format,
- &image_desc,
- host_ptr,
- &err);
+ (void) err;
+ return cl_mem_svm_allocate(context, flags, size, alignment);
error:
- if (errcode_ret)
- *errcode_ret = err;
- return mem;
+ return NULL;
}
-cl_mem
-clCreateImage3D(cl_context context,
- cl_mem_flags flags,
- const cl_image_format * image_format,
- size_t image_width,
- size_t image_height,
- size_t image_depth,
- size_t image_row_pitch,
- size_t image_slice_pitch,
- void * host_ptr,
- cl_int * errcode_ret)
+void
+clSVMFree (cl_context context, void* svm_pointer)
{
- cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
- cl_image_desc image_desc;
-
- image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
- image_desc.image_width = image_width;
- image_desc.image_height = image_height;
- image_desc.image_depth = image_depth;
- image_desc.image_row_pitch = image_row_pitch;
- image_desc.image_slice_pitch = image_slice_pitch;
-
- mem = cl_mem_new_image(context,
- flags,
- image_format,
- &image_desc,
- host_ptr,
- &err);
+ (void) err;
+ return cl_mem_svm_delete(context, svm_pointer);
error:
- if (errcode_ret)
- *errcode_ret = err;
- return mem;
+ return;
}
cl_int
-clRetainMemObject(cl_mem memobj)
+clEnqueueSVMFree (cl_command_queue command_queue,
+ cl_uint num_svm_pointers,
+ void *svm_pointers[],
+ void (CL_CALLBACK *pfn_free_func)( cl_command_queue queue,
+ cl_uint num_svm_pointers,
+ void *svm_pointers[],
+ void *user_data),
+ void *user_data,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
cl_int err = CL_SUCCESS;
- CHECK_MEM (memobj);
- cl_mem_add_ref(memobj);
-error:
+ cl_int i = 0;
+ void** pointers = NULL;
+ cl_event e = NULL;
+ cl_int e_status;
+ enqueue_data *data;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if(num_svm_pointers == 0 || svm_pointers == NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ for(i=0; i<num_svm_pointers; i++) {
+ if(svm_pointers[i] == NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ }
+ if(err != CL_SUCCESS)
+ break;
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_SVM_FREE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ pointers = malloc(num_svm_pointers * sizeof(void *));
+ if(UNLIKELY(pointers == NULL)) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ break;
+ }
+ memcpy(pointers, svm_pointers, num_svm_pointers * sizeof(void *));
+
+ data = &e->exec_data;
+ data->type = EnqueueSVMFree;
+ data->queue = command_queue;
+ data->pointers = pointers;
+ data->free_func = pfn_free_func;
+ data->size = num_svm_pointers;
+ data->ptr = user_data;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
return err;
}
cl_int
-clReleaseMemObject(cl_mem memobj)
+clEnqueueSVMMap (cl_command_queue command_queue,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ void *svm_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
cl_int err = CL_SUCCESS;
- CHECK_MEM (memobj);
- cl_mem_delete(memobj);
+ cl_mem buffer;
+
+ CHECK_QUEUE(command_queue);
+ buffer = cl_context_get_svm_from_ptr(command_queue->ctx, svm_ptr);
+ if(buffer == NULL) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ clEnqueueMapBuffer(command_queue, buffer, blocking_map, map_flags, 0, size,
+ num_events_in_wait_list, event_wait_list, event, &err);
+ if(event)
+ (*event)->event_type = CL_COMMAND_SVM_MAP;
error:
return err;
}
cl_int
-clGetSupportedImageFormats(cl_context ctx,
- cl_mem_flags flags,
- cl_mem_object_type image_type,
- cl_uint num_entries,
- cl_image_format * image_formats,
- cl_uint * num_image_formats)
+clEnqueueSVMUnmap (cl_command_queue command_queue,
+ void *svm_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
cl_int err = CL_SUCCESS;
- CHECK_CONTEXT (ctx);
- if (UNLIKELY(num_entries == 0 && image_formats != NULL)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
- if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
- image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
- image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
- image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
- image_type != CL_MEM_OBJECT_IMAGE2D &&
- image_type != CL_MEM_OBJECT_IMAGE3D)) {
+ cl_mem buffer;
+
+ CHECK_QUEUE(command_queue);
+ buffer = cl_context_get_svm_from_ptr(command_queue->ctx, svm_ptr);
+ if(buffer == NULL) {
err = CL_INVALID_VALUE;
goto error;
}
- err = cl_image_get_supported_fmt(ctx,
- image_type,
- num_entries,
- image_formats,
- num_image_formats);
+
+ err = clEnqueueUnmapMemObject(command_queue, buffer, svm_ptr,
+ num_events_in_wait_list, event_wait_list, event);
+ if(event)
+ (*event)->event_type = CL_COMMAND_SVM_UNMAP;
error:
return err;
}
-cl_int
-clGetMemObjectInfo(cl_mem memobj,
- cl_mem_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
+cl_int clEnqueueSVMMemcpy (cl_command_queue command_queue,
+ cl_bool blocking_copy,
+ void *dst_ptr,
+ const void *src_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
cl_int err = CL_SUCCESS;
- CHECK_MEM(memobj);
+ enqueue_data *data;
+ cl_int e_status;
+ cl_event e = NULL;
- err = cl_get_mem_object_info(memobj,
- param_name,
- param_value_size,
- param_value,
- param_value_size_ret);
-error:
- return err;
-}
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
-cl_int
-clGetImageInfo(cl_mem mem,
- cl_image_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- return cl_get_image_info(mem,
- param_name,
- param_value_size,
- param_value,
- param_value_size_ret);
+ if(UNLIKELY(dst_ptr == NULL || src_ptr == NULL || size == 0 )) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if(((size_t)src_ptr < (size_t)dst_ptr && ((size_t)src_ptr + size > (size_t)dst_ptr)) ||
+ ((size_t)dst_ptr < (size_t)src_ptr && ((size_t)dst_ptr + size > (size_t)src_ptr))) {
+ err = CL_MEM_COPY_OVERLAP;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_SVM_MEMCPY, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_copy) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* Blocking call API is a sync point of flush. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueSVMMemCopy;
+ data->queue = command_queue;
+ data->ptr = dst_ptr;
+ data->const_ptr = src_ptr;
+ data->size = size;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while(0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
}
-cl_int
-clSetMemObjectDestructorCallback(cl_mem memobj,
- void (CL_CALLBACK *pfn_notify) (cl_mem, void*),
- void * user_data)
+cl_int clEnqueueSVMMemFill (cl_command_queue command_queue,
+ void *svm_ptr,
+ const void *pattern,
+ size_t pattern_size,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
cl_int err = CL_SUCCESS;
- CHECK_MEM(memobj);
- INVALID_VALUE_IF (pfn_notify == 0);
+ enqueue_data *data;
+ cl_int e_status;
+ cl_event e = NULL;
- cl_mem_dstr_cb *cb = (cl_mem_dstr_cb*)malloc(sizeof(cl_mem_dstr_cb));
- if (!cb) {
- err = CL_OUT_OF_HOST_MEMORY;
- goto error;
- }
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
- memset(cb, 0, sizeof(cl_mem_dstr_cb));
- cb->pfn_notify = pfn_notify;
- cb->user_data = user_data;
- cb->next = memobj->dstr_cb;
- memobj->dstr_cb = cb;
+ if(UNLIKELY(svm_ptr == NULL ||
+ ((size_t)svm_ptr & (pattern_size - 1)) != 0)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if(UNLIKELY(pattern == NULL ||
+ (pattern_size & (pattern_size - 1)) != 0 ||
+ pattern_size > 128)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if(UNLIKELY(size == 0 ||
+ (size % pattern_size) != 0)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_SVM_MEMFILL, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueSVMMemFill;
+ data->queue = command_queue;
+ data->ptr = svm_ptr;
+ data->const_ptr = pattern;
+ data->pattern_size = pattern_size;
+ data->size = size;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while(0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
-error:
return err;
}
-cl_sampler
-clCreateSampler(cl_context context,
- cl_bool normalized,
- cl_addressing_mode addressing,
- cl_filter_mode filter,
- cl_int * errcode_ret)
+cl_mem
+clCreateImage2D(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_row_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret)
{
- cl_sampler sampler = NULL;
+ cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
- sampler = cl_sampler_new(context, normalized, addressing, filter, &err);
+ cl_image_desc image_desc;
+ memset(&image_desc, 0, sizeof(image_desc));
+
+ image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ image_desc.image_width = image_width;
+ image_desc.image_height = image_height;
+ image_desc.image_row_pitch = image_row_pitch;
+
+ mem = cl_mem_new_image(context,
+ flags,
+ image_format,
+ &image_desc,
+ host_ptr,
+ &err);
error:
if (errcode_ret)
*errcode_ret = err;
- return sampler;
+ return mem;
}
-cl_int
-clRetainSampler(cl_sampler sampler)
+cl_mem
+clCreateImage3D(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_depth,
+ size_t image_row_pitch,
+ size_t image_slice_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret)
{
+ cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
- CHECK_SAMPLER (sampler);
- cl_sampler_add_ref(sampler);
+ CHECK_CONTEXT (context);
+ cl_image_desc image_desc;
+
+ image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ image_desc.image_width = image_width;
+ image_desc.image_height = image_height;
+ image_desc.image_depth = image_depth;
+ image_desc.image_row_pitch = image_row_pitch;
+ image_desc.image_slice_pitch = image_slice_pitch;
+
+ mem = cl_mem_new_image(context,
+ flags,
+ image_format,
+ &image_desc,
+ host_ptr,
+ &err);
error:
- return err;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
}
cl_int
-clReleaseSampler(cl_sampler sampler)
+clGetSupportedImageFormats(cl_context ctx,
+ cl_mem_flags flags,
+ cl_mem_object_type image_type,
+ cl_uint num_entries,
+ cl_image_format * image_formats,
+ cl_uint * num_image_formats)
{
cl_int err = CL_SUCCESS;
- CHECK_SAMPLER (sampler);
- cl_sampler_delete(sampler);
+ CHECK_CONTEXT (ctx);
+ if (UNLIKELY(num_entries == 0 && image_formats != NULL)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
+ image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+ image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+ image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
+ image_type != CL_MEM_OBJECT_IMAGE2D &&
+ image_type != CL_MEM_OBJECT_IMAGE3D)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ err = cl_image_get_supported_fmt(ctx,
+ flags,
+ image_type,
+ num_entries,
+ image_formats,
+ num_image_formats);
+
error:
return err;
}
-cl_int
-clGetSamplerInfo(cl_sampler sampler,
- cl_sampler_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
+cl_sampler
+clCreateSamplerWithProperties(cl_context context,
+ const cl_sampler_properties *sampler_properties,
+ cl_int * errcode_ret)
{
+ cl_sampler sampler = NULL;
cl_int err = CL_SUCCESS;
- CHECK_SAMPLER (sampler);
-
- if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
- FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&sampler->ref_n, CL_SUCCESS);
- } else if (param_name == CL_SAMPLER_CONTEXT) {
- FILL_GETINFO_RET (cl_context, 1, &sampler->ctx, CL_SUCCESS);
- } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
- FILL_GETINFO_RET (cl_bool, 1, &sampler->normalized_coords, CL_SUCCESS);
- } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
- FILL_GETINFO_RET (cl_addressing_mode, 1, &sampler->address, CL_SUCCESS);
- } else if (param_name == CL_SAMPLER_FILTER_MODE ) {
- FILL_GETINFO_RET (cl_filter_mode, 1, &sampler->filter, CL_SUCCESS);
- } else{
- return CL_INVALID_VALUE;
+ CHECK_CONTEXT (context);
+ cl_bool normalized = 0xFFFFFFFF;
+ cl_addressing_mode addressing = 0xFFFFFFFF;
+ cl_filter_mode filter = 0xFFFFFFFF;
+ if(sampler_properties)
+ {
+ cl_ulong sam_type;
+ cl_ulong sam_val;
+ cl_uint i;
+ for(i = 0;(sam_type = sampler_properties[i++])!=0;i++)
+ {
+ sam_val = sampler_properties[i];
+ switch(sam_type)
+ {
+ case CL_SAMPLER_NORMALIZED_COORDS:
+ if(normalized != 0xFFFFFFFF)
+ err = CL_INVALID_VALUE;
+ else if(sam_val == CL_TRUE || sam_val == CL_FALSE)
+ normalized = sam_val;
+ else
+ err = CL_INVALID_VALUE;
+ break;
+ case CL_SAMPLER_ADDRESSING_MODE:
+ if(addressing != 0xFFFFFFFF)
+ err = CL_INVALID_VALUE;
+ else if(sam_val == CL_ADDRESS_MIRRORED_REPEAT || sam_val == CL_ADDRESS_REPEAT ||
+ sam_val == CL_ADDRESS_CLAMP_TO_EDGE || sam_val == CL_ADDRESS_CLAMP ||
+ sam_val == CL_ADDRESS_NONE)
+ addressing = sam_val;
+ else
+ err = CL_INVALID_VALUE;
+ break;
+ case CL_SAMPLER_FILTER_MODE:
+ if(filter != 0xFFFFFFFF)
+ err = CL_INVALID_VALUE;
+ else if(sam_val == CL_FILTER_LINEAR || sam_val == CL_FILTER_NEAREST)
+ filter = sam_val;
+ else
+ err = CL_INVALID_VALUE;
+ break;
+ default:
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ }
}
-
+ if(err)
+ goto error;
+ if(normalized == 0xFFFFFFFF) normalized = CL_TRUE;
+ if(addressing == 0xFFFFFFFF) addressing = CL_ADDRESS_CLAMP;
+ if(filter == 0xFFFFFFFF) filter = CL_FILTER_NEAREST;
+ sampler = cl_create_sampler(context, normalized, addressing, filter, &err);
error:
- return err;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return sampler;
}
cl_program
@@ -946,7 +821,10 @@ clBuildProgram(cl_program program,
/* Check that every device in the list belongs to this context */
if (num_devices != 0) {
assert(program->ctx);
- INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+ err = cl_devices_list_include_check(program->ctx->device_num,
+ program->ctx->devices, num_devices, device_list);
+ if (err)
+ goto error;
}
assert(program->source_type == FROM_LLVM ||
@@ -988,7 +866,10 @@ clCompileProgram(cl_program program ,
/* Check that every device in the list belongs to this context */
if (num_devices != 0) {
assert(program->ctx);
- INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+ err = cl_devices_list_include_check(program->ctx->device_num,
+ program->ctx->devices, num_devices, device_list);
+ if (err)
+ goto error;
}
/* TODO support create program from binary */
@@ -1027,2232 +908,319 @@ clLinkProgram(cl_context context,
INVALID_VALUE_IF (pfn_notify == 0 && user_data != NULL);
INVALID_VALUE_IF (num_input_programs == 0 && input_programs != NULL);
INVALID_VALUE_IF (num_input_programs != 0 && input_programs == NULL);
- INVALID_VALUE_IF (num_input_programs == 0 && input_programs == NULL);
-
- program = cl_program_link(context, num_input_programs, input_programs, options, &err);
-
- if(program) program->is_built = CL_TRUE;
-
- if (pfn_notify) pfn_notify(program, user_data);
-
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return program;
-}
-
-cl_int
-clUnloadCompiler(void)
-{
- return CL_SUCCESS;
-}
-
-cl_int
-clUnloadPlatformCompiler(cl_platform_id platform)
-{
- return CL_SUCCESS;
-}
-
-cl_int
-clGetProgramInfo(cl_program program,
- cl_program_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- char * ret_str = "";
-
- CHECK_PROGRAM (program);
-
- if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
- cl_uint ref = program->ref_n;
- FILL_GETINFO_RET (cl_uint, 1, (&ref), CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_CONTEXT) {
- cl_context context = program->ctx;
- FILL_GETINFO_RET (cl_context, 1, &context, CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
- cl_uint num_dev = 1; // Just 1 dev now.
- FILL_GETINFO_RET (cl_uint, 1, &num_dev, CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_DEVICES) {
- cl_device_id dev_id = program->ctx->device;
- FILL_GETINFO_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
- cl_uint kernels_num = program->ker_n;
- FILL_GETINFO_RET (cl_uint, 1, &kernels_num, CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_SOURCE) {
-
- if (!program->source)
- FILL_GETINFO_RET (char, 1, &ret_str, CL_SUCCESS);
- FILL_GETINFO_RET (char, (strlen(program->source) + 1),
- program->source, CL_SUCCESS);
- } else if(param_name == CL_PROGRAM_KERNEL_NAMES) {
- cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
- } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
- if (program->binary == NULL){
- if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
- program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
- }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
- program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
- }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
- program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
- }else{
- return CL_INVALID_BINARY;
- }
- }
-
- if (program->binary == NULL || program->binary_sz == 0) {
- return CL_OUT_OF_RESOURCES;
- }
- FILL_GETINFO_RET (size_t, 1, (&program->binary_sz), CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_BINARIES) {
- if (param_value_size_ret)
- *param_value_size_ret = sizeof(void*);
- if (!param_value)
- return CL_SUCCESS;
-
- /* param_value points to an array of n
- pointers allocated by the caller */
- if (program->binary == NULL) {
- if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
- program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
- }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
- program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
- }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
- program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
- }else{
- return CL_INVALID_BINARY;
- }
- }
-
- if (program->binary == NULL || program->binary_sz == 0) {
- return CL_OUT_OF_RESOURCES;
- }
-
- memcpy(*((void **)param_value), program->binary, program->binary_sz);
- return CL_SUCCESS;
- } else {
- return CL_INVALID_VALUE;
- }
-
-error:
- return err;
-}
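The removed clGetProgramInfo serializes the binary lazily, repeating the same if/else chain for CL_PROGRAM_BINARY_SIZES and CL_PROGRAM_BINARIES: the program's binary_type selects the serializer's format flag. A hypothetical helper capturing that mapping (the 0/1/2 flag values are taken from the removed code; the helper itself is not part of the patch):

    /* Hypothetical helper for the binary_type -> serialization-flag mapping. */
    static int binary_type_to_serialize_flag(cl_uint binary_type)
    {
      switch (binary_type) {
      case CL_PROGRAM_BINARY_TYPE_EXECUTABLE:      return 0;
      case CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT: return 1;
      case CL_PROGRAM_BINARY_TYPE_LIBRARY:         return 2;
      default: return -1; /* caller maps this to CL_INVALID_BINARY */
      }
    }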
-
-cl_int
-clGetProgramBuildInfo(cl_program program,
- cl_device_id device,
- cl_program_build_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- char * ret_str = "";
-
- CHECK_PROGRAM (program);
- INVALID_DEVICE_IF (device != program->ctx->device);
-
- if (param_name == CL_PROGRAM_BUILD_STATUS) {
- FILL_GETINFO_RET (cl_build_status, 1, &program->build_status, CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
- if (program->is_built && program->build_opts)
- ret_str = program->build_opts;
-
- FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
- } else if (param_name == CL_PROGRAM_BUILD_LOG) {
- FILL_GETINFO_RET (char, program->build_log_sz + 1, program->build_log, CL_SUCCESS);
- if (param_value_size_ret)
- *param_value_size_ret = program->build_log_sz + 1;
- }else if (param_name == CL_PROGRAM_BINARY_TYPE){
-
- FILL_GETINFO_RET (cl_uint, 1, &program->binary_type, CL_SUCCESS);
- } else {
- return CL_INVALID_VALUE;
- }
-
-error:
- return err;
-}
-
-cl_kernel
-clCreateKernel(cl_program program,
- const char * kernel_name,
- cl_int * errcode_ret)
-{
- cl_kernel kernel = NULL;
- cl_int err = CL_SUCCESS;
-
- CHECK_PROGRAM (program);
- if (program->ker_n <= 0) {
- err = CL_INVALID_PROGRAM_EXECUTABLE;
- goto error;
- }
- INVALID_VALUE_IF (kernel_name == NULL);
- kernel = cl_program_create_kernel(program, kernel_name, &err);
-
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return kernel;
-}
-
-cl_int
-clCreateKernelsInProgram(cl_program program,
- cl_uint num_kernels,
- cl_kernel * kernels,
- cl_uint * num_kernels_ret)
-{
- cl_int err = CL_SUCCESS;
-
- CHECK_PROGRAM (program);
- if (program->ker_n <= 0) {
- err = CL_INVALID_PROGRAM_EXECUTABLE;
- goto error;
- }
- if (kernels && num_kernels < program->ker_n) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if(num_kernels_ret)
- *num_kernels_ret = program->ker_n;
-
- if(kernels)
- err = cl_program_create_kernels_in_program(program, kernels);
-
-error:
- return err;
-}
-
-cl_int
-clRetainKernel(cl_kernel kernel)
-{
- cl_int err = CL_SUCCESS;
- CHECK_KERNEL(kernel);
- cl_kernel_add_ref(kernel);
-error:
- return err;
-}
-
-cl_int
-clReleaseKernel(cl_kernel kernel)
-{
- cl_int err = CL_SUCCESS;
- CHECK_KERNEL(kernel);
- cl_kernel_delete(kernel);
-error:
- return err;
-}
-
-cl_int
-clSetKernelArg(cl_kernel kernel,
- cl_uint arg_index,
- size_t arg_size,
- const void * arg_value)
-{
- cl_int err = CL_SUCCESS;
- CHECK_KERNEL(kernel);
-
-#ifdef HAS_CMRT
- if (kernel->cmrt_kernel != NULL)
- err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
- else
-#endif
- err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
-error:
- return err;
-}
-
-cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
- size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- CHECK_KERNEL(kernel);
-
- if(kernel->program->build_opts == NULL ||
- strstr(kernel->program->build_opts,"-cl-kernel-arg-info") == NULL ) {
- err = CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
- goto error;
- }
- if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER
- && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
- && param_name != CL_KERNEL_ARG_TYPE_NAME
- && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
- && param_name != CL_KERNEL_ARG_NAME) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (arg_index >= kernel->arg_n) {
- err = CL_INVALID_ARG_INDEX;
- goto error;
- }
-
- err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
- param_value, param_value_size_ret);
-
-error:
- return err;
-}
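As the strstr check above shows, argument info is only recorded when the program was built with -cl-kernel-arg-info; otherwise the query fails with CL_KERNEL_ARG_INFO_NOT_AVAILABLE. A hypothetical caller (prog and kernel are placeholders):

    /* Hypothetical usage: build with arg-info recording, then query arg 0. */
    clBuildProgram(prog, 0, NULL, "-cl-kernel-arg-info", NULL, NULL);
    char type_name[64];
    cl_int err = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_NAME,
                                    sizeof(type_name), type_name, NULL);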
-
-cl_int
-clGetKernelInfo(cl_kernel kernel,
- cl_kernel_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err;
-
- CHECK_KERNEL(kernel);
-
- if (param_name == CL_KERNEL_CONTEXT) {
- FILL_GETINFO_RET (cl_context, 1, &kernel->program->ctx, CL_SUCCESS);
- } else if (param_name == CL_KERNEL_PROGRAM) {
- FILL_GETINFO_RET (cl_program, 1, &kernel->program, CL_SUCCESS);
- } else if (param_name == CL_KERNEL_NUM_ARGS) {
- cl_uint n = kernel->arg_n;
- FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
- } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
- cl_int ref = kernel->ref_n;
- FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
- } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
- const char * n = cl_kernel_get_name(kernel);
- FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
- } else if (param_name == CL_KERNEL_ATTRIBUTES) {
- const char * n = cl_kernel_get_attributes(kernel);
- FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
- } else {
- return CL_INVALID_VALUE;
- }
-
-error:
- return err;
-}
-
-cl_int
-clGetKernelWorkGroupInfo(cl_kernel kernel,
- cl_device_id device,
- cl_kernel_work_group_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- return cl_get_kernel_workgroup_info(kernel,
- device,
- param_name,
- param_value_size,
- param_value,
- param_value_size_ret);
-}
-
-cl_int
-clGetKernelSubGroupInfoKHR(cl_kernel kernel,
- cl_device_id device,
- cl_kernel_work_group_info param_name,
- size_t input_value_size,
- const void * input_value,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- return cl_get_kernel_subgroup_info(kernel,
- device,
- param_name,
- input_value_size,
- input_value,
- param_value_size,
- param_value,
- param_value_size_ret);
-}
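A hypothetical cl_khr_subgroups query against this entry point, asking what sub-group size a given local size would yield (kernel and device are placeholders):

    /* Hypothetical usage of the cl_khr_subgroups query. */
    size_t local_size[3] = {16, 1, 1};
    size_t max_sub_group = 0;
    clGetKernelSubGroupInfoKHR(kernel, device,
                               CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
                               sizeof(local_size), local_size,
                               sizeof(max_sub_group), &max_sub_group, NULL);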
-
-cl_int
-clWaitForEvents(cl_uint num_events,
- const cl_event * event_list)
-{
- cl_int err = CL_SUCCESS;
- cl_context ctx = NULL;
-
- if(num_events > 0 && event_list)
- ctx = event_list[0]->ctx;
-
- TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
-
- while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
-    usleep(8000); //sleep 8 ms to wait for the other thread
- }
-
-error:
- return err;
-}
-
-cl_int
-clGetEventInfo(cl_event event,
- cl_event_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- CHECK_EVENT(event);
-
- if (param_name == CL_EVENT_COMMAND_QUEUE) {
- FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
- } else if (param_name == CL_EVENT_CONTEXT) {
- FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
- } else if (param_name == CL_EVENT_COMMAND_TYPE) {
- FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
- } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
- cl_event_update_status(event, 0);
- FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
- } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
- cl_uint ref = event->ref_n;
- FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
- } else {
- return CL_INVALID_VALUE;
- }
-
-error:
- return err;
-
-}
-
-cl_event
-clCreateUserEvent(cl_context context,
- cl_int * errcode_ret)
-{
- cl_int err = CL_SUCCESS;
- cl_event event = NULL;
- CHECK_CONTEXT(context);
-
- TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
-
-error:
- if(errcode_ret)
- *errcode_ret = err;
- return event;
-}
-
-cl_int
-clRetainEvent(cl_event event)
-{
- cl_int err = CL_SUCCESS;
-
- CHECK_EVENT(event);
- cl_event_add_ref(event);
-
-error:
- return err;
-}
-
-cl_int
-clReleaseEvent(cl_event event)
-{
- cl_int err = CL_SUCCESS;
-
- CHECK_EVENT(event);
- cl_event_delete(event);
-
-error:
- return err;
-}
-
-cl_int
-clSetUserEventStatus(cl_event event,
- cl_int execution_status)
-{
- cl_int err = CL_SUCCESS;
-
- CHECK_EVENT(event);
- if(execution_status > CL_COMPLETE) {
- err = CL_INVALID_VALUE;
- goto error;
- }
- if(event->status != CL_SUBMITTED) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- cl_event_set_status(event, execution_status);
-error:
- return err;
-}
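User events start in CL_SUBMITTED and may be completed exactly once, which is what the CL_INVALID_OPERATION check above enforces. A hypothetical gating pattern (ctx, queue, buf, size, host_ptr are placeholders):

    /* Hypothetical usage: hold back an enqueued read until the host says go. */
    cl_event gate = clCreateUserEvent(ctx, NULL);
    clEnqueueReadBuffer(queue, buf, CL_FALSE, 0, size, host_ptr,
                        1, &gate, NULL);
    /* ... later, from any host thread, release it exactly once: */
    clSetUserEventStatus(gate, CL_COMPLETE);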
-
-cl_int
-clSetEventCallback(cl_event event,
- cl_int command_exec_callback_type,
- void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
- void * user_data)
-{
- cl_int err = CL_SUCCESS;
-
- CHECK_EVENT(event);
- if((pfn_notify == NULL) ||
- (command_exec_callback_type > CL_SUBMITTED) ||
- (command_exec_callback_type < CL_COMPLETE)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
- err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
-
-error:
- return err;
-
-}
-
-cl_int
-clGetEventProfilingInfo(cl_event event,
- cl_profiling_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- cl_int err = CL_SUCCESS;
- cl_ulong ret_val;
-
- CHECK_EVENT(event);
- cl_event_update_status(event, 0);
-
- if (event->type == CL_COMMAND_USER ||
- !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
- event->status != CL_COMPLETE) {
- err = CL_PROFILING_INFO_NOT_AVAILABLE;
- goto error;
- }
-
- if (param_value && param_value_size < sizeof(cl_ulong)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (param_name == CL_PROFILING_COMMAND_QUEUED) {
- ret_val = event->queued_timestamp;
- } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
- ret_val= event->queued_timestamp + cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[1]);
- } else if (param_name == CL_PROFILING_COMMAND_START) {
- err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
- ret_val = event->queued_timestamp + cl_event_get_start_timestamp(event);
- } else if (param_name == CL_PROFILING_COMMAND_END) {
- err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
- ret_val = event->queued_timestamp + cl_event_get_end_timestamp(event);
- } else {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (err == CL_SUCCESS) {
- if (param_value)
- *(cl_ulong*)param_value = ret_val;
- if (param_value_size_ret)
- *param_value_size_ret = sizeof(cl_ulong);
- }
-error:
- return err;
-}
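All four profiling queries above return a cl_ulong derived from queued_timestamp plus a delta; a typical caller computes the device-side duration from the START/END pair (ev is a placeholder for a completed event on a profiling-enabled queue):

    /* Hypothetical usage: device time of a completed event, in nanoseconds. */
    cl_ulong t_start = 0, t_end = 0;
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                            sizeof(t_start), &t_start, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                            sizeof(t_end), &t_end, NULL);
    printf("kernel: %llu ns\n", (unsigned long long)(t_end - t_start));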
-
-cl_int
-clFlush(cl_command_queue command_queue)
-{
-  /* Nothing to do for now: clEnqueueNDRangeKernel
-   * currently flushes at the end of each call.
-   * We may need to optimize this later. */
- return 0;
-}
-
-cl_int
-clFinish(cl_command_queue command_queue)
-{
- cl_int err = CL_SUCCESS;
-
- CHECK_QUEUE (command_queue);
-
-#ifdef HAS_CMRT
- if (command_queue->cmrt_event != NULL)
- return cmrt_wait_for_task_finished(command_queue);
-#endif
-
- err = cl_command_queue_finish(command_queue);
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueReadBuffer(cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_read,
- size_t offset,
- size_t size,
- void * ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, defer_enqueue_data = { 0 };
- CHECK_QUEUE(command_queue);
- CHECK_MEM(buffer);
- if (command_queue->ctx != buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!ptr || !size || offset + size > buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-
- data = &defer_enqueue_data;
- data->type = EnqueueReadBuffer;
- data->mem_obj = buffer;
- data->ptr = ptr;
- data->offset = offset;
- data->size = size;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueReadBufferRect(cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_read,
- const size_t * buffer_origin,
- const size_t * host_origin,
- const size_t * region,
- size_t buffer_row_pitch,
- size_t buffer_slice_pitch,
- size_t host_row_pitch,
- size_t host_slice_pitch,
- void * ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(buffer);
-
- if (command_queue->ctx != buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if(buffer_row_pitch == 0)
- buffer_row_pitch = region[0];
- if(buffer_slice_pitch == 0)
- buffer_slice_pitch = region[1] * buffer_row_pitch;
-
- if(host_row_pitch == 0)
- host_row_pitch = region[0];
- if(host_slice_pitch == 0)
- host_slice_pitch = region[1] * host_row_pitch;
-
- if (buffer_row_pitch < region[0] ||
- host_row_pitch < region[0]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
- (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
- + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
- + buffer_origin[0] + region[0] > buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueReadBufferRect;
- data->mem_obj = buffer;
- data->ptr = ptr;
- data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
- data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
- data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
- data->row_pitch = buffer_row_pitch;
- data->slice_pitch = buffer_slice_pitch;
- data->host_row_pitch = host_row_pitch;
- data->host_slice_pitch = host_slice_pitch;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
-
- error:
- return err;
-}
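A worked instance of the bounds check above, with both pitches left at their defaults (row pitch region[0], slice pitch region[1] * row pitch); the numbers are illustrative, not from the patch:

    /* region = {64, 4, 2}, buffer_origin = {0, 0, 0}:
     * buffer_row_pitch   = 64
     * buffer_slice_pitch = 4 * 64 = 256
     * last byte touched  = (2-1)*256 + (4-1)*64 + 0 + 64 = 512
     * so buffer->size must be at least 512, else CL_INVALID_VALUE. */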
-
-cl_int
-clEnqueueWriteBuffer(cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_write,
- size_t offset,
- size_t size,
- const void * ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(buffer);
- if (command_queue->ctx != buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!ptr || !size || offset + size > buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueWriteBuffer;
- data->mem_obj = buffer;
- data->const_ptr = ptr;
- data->offset = offset;
- data->size = size;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
-
- error:
- return err;
-}
-
-cl_int
-clEnqueueWriteBufferRect(cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_write,
- const size_t * buffer_origin,
- const size_t * host_origin,
- const size_t * region,
- size_t buffer_row_pitch,
- size_t buffer_slice_pitch,
- size_t host_row_pitch,
- size_t host_slice_pitch,
- const void * ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(buffer);
-
- if (command_queue->ctx != buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if(buffer_row_pitch == 0)
- buffer_row_pitch = region[0];
- if(buffer_slice_pitch == 0)
- buffer_slice_pitch = region[1] * buffer_row_pitch;
-
- if(host_row_pitch == 0)
- host_row_pitch = region[0];
- if(host_slice_pitch == 0)
- host_slice_pitch = region[1] * host_row_pitch;
-
- if (buffer_row_pitch < region[0] ||
- host_row_pitch < region[0]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
- (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
- + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
- + buffer_origin[0] + region[0] > buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueWriteBufferRect;
- data->mem_obj = buffer;
- data->const_ptr = ptr;
- data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
- data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
- data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
- data->row_pitch = buffer_row_pitch;
- data->slice_pitch = buffer_slice_pitch;
- data->host_row_pitch = host_row_pitch;
- data->host_slice_pitch = host_slice_pitch;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueFillImage(cl_command_queue command_queue,
- cl_mem image,
- const void * fill_color,
- const size_t * porigin,
- const size_t * pregion,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_IMAGE(image, src_image);
- FIXUP_IMAGE_REGION(src_image, pregion, region);
- FIXUP_IMAGE_ORIGIN(src_image, porigin, origin);
-
- if (command_queue->ctx != image->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (fill_color == NULL) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- err = cl_image_fill(command_queue, fill_color, src_image, origin, region);
- if (err) {
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueFillImage;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue);
-
- return 0;
-
- error:
- return err;
-}
-
-cl_int
-clEnqueueFillBuffer(cl_command_queue command_queue,
- cl_mem buffer,
- const void * pattern,
- size_t pattern_size,
- size_t offset,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
- static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
- int i = 0;
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(buffer);
-
- if (command_queue->ctx != buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (offset + size > buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (pattern == NULL) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
- if (valid_sz[i] == pattern_size)
- break;
- }
- if (i == sizeof(valid_sz) / sizeof(size_t)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (offset % pattern_size || size % pattern_size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
- if (err) {
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueFillBuffer;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
-
- return 0;
-
- error:
- return err;
-}
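A hypothetical fill call satisfying the checks above: pattern_size must be one of 1, 2, 4, ..., 128, and both offset and size must be multiples of it (queue and buf are placeholders):

    /* Hypothetical usage: fill 4 KiB with a 4-byte zero pattern. */
    cl_uint zero = 0;
    cl_int err = clEnqueueFillBuffer(queue, buf, &zero, sizeof(zero),
                                     0 /* offset */, 4096 /* size */,
                                     0, NULL, NULL);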
-
-cl_int
-clEnqueueCopyBuffer(cl_command_queue command_queue,
- cl_mem src_buffer,
- cl_mem dst_buffer,
- size_t src_offset,
- size_t dst_offset,
- size_t cb,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(src_buffer);
- CHECK_MEM(dst_buffer);
-
- if (command_queue->ctx != src_buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (command_queue->ctx != dst_buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (src_offset + cb > src_buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
- if (dst_offset + cb > dst_buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- /* Check overlap */
- if (src_buffer == dst_buffer
- && (src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
- && (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1)) {
- err = CL_MEM_COPY_OVERLAP;
- goto error;
- }
-
- /* Check sub overlap */
- if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) {
- struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer;
- struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer;
- size_t src_sub_offset = src_b->sub_offset;
- size_t dst_sub_offset = dst_b->sub_offset;
-
- if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset
- && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1)
- && (dst_offset + dst_sub_offset <= src_offset + src_sub_offset
- && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
- err = CL_MEM_COPY_OVERLAP;
- goto error;
- }
- }
-
- err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueCopyBuffer;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
-
- return 0;
-
-error:
- return err;
-}
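An observation, not part of the patch: the conjunction in the overlap test above can only hold when the two offsets are equal. The general test for two ranges of length cb is that each starts before the other ends; a corrected sketch:

    /* Sketch: [src, src+cb) and [dst, dst+cb) overlap iff each starts
     * before the other ends. */
    if (src_buffer == dst_buffer &&
        src_offset < dst_offset + cb &&
        dst_offset < src_offset + cb) {
      err = CL_MEM_COPY_OVERLAP;
      goto error;
    }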
-
-cl_int
-clEnqueueCopyBufferRect(cl_command_queue command_queue,
- cl_mem src_buffer,
- cl_mem dst_buffer,
- const size_t * src_origin,
- const size_t * dst_origin,
- const size_t * region,
- size_t src_row_pitch,
- size_t src_slice_pitch,
- size_t dst_row_pitch,
- size_t dst_slice_pitch,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(src_buffer);
- CHECK_MEM(dst_buffer);
-
- if ((command_queue->ctx != src_buffer->ctx) ||
- (command_queue->ctx != dst_buffer->ctx)) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if(src_row_pitch == 0)
- src_row_pitch = region[0];
- if(src_slice_pitch == 0)
- src_slice_pitch = region[1] * src_row_pitch;
-
- if(dst_row_pitch == 0)
- dst_row_pitch = region[0];
- if(dst_slice_pitch == 0)
- dst_slice_pitch = region[1] * dst_row_pitch;
-
- if (src_row_pitch < region[0] ||
- dst_row_pitch < region[0]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
- (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((src_origin[2] + region[2] - 1) * src_slice_pitch
- + (src_origin[1] + region[1] - 1) * src_row_pitch
- + src_origin[0] + region[0] > src_buffer->size
- ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch
- + (dst_origin[1] + region[1] - 1) * dst_row_pitch
- + dst_origin[0] + region[0] > dst_buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (src_buffer == dst_buffer &&
- check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
- err = CL_MEM_COPY_OVERLAP;
- goto error;
- }
-
- cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
- src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueCopyBufferRect;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueReadImage(cl_command_queue command_queue,
- cl_mem mem,
- cl_bool blocking_read,
- const size_t * porigin,
- const size_t * pregion,
- size_t row_pitch,
- size_t slice_pitch,
- void * ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_IMAGE(mem, image);
- FIXUP_IMAGE_REGION(image, pregion, region);
- FIXUP_IMAGE_ORIGIN(image, porigin, origin);
- if (command_queue->ctx != mem->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!row_pitch)
- row_pitch = image->bpp*region[0];
- else if (row_pitch < image->bpp*region[0]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (image->slice_pitch) {
- if (!slice_pitch)
- slice_pitch = row_pitch*region[1];
- else if (slice_pitch < row_pitch*region[1]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
- }
- else if (slice_pitch) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!ptr) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueReadImage;
- data->mem_obj = mem;
- data->ptr = ptr;
- data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
- data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
- data->row_pitch = row_pitch;
- data->slice_pitch = slice_pitch;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueWriteImage(cl_command_queue command_queue,
- cl_mem mem,
- cl_bool blocking_write,
- const size_t * porigin,
- const size_t * pregion,
- size_t row_pitch,
- size_t slice_pitch,
- const void * ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_IMAGE(mem, image);
- FIXUP_IMAGE_REGION(image, pregion, region);
- FIXUP_IMAGE_ORIGIN(image, porigin, origin);
- if (command_queue->ctx != mem->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!row_pitch)
- row_pitch = image->bpp*region[0];
- else if (row_pitch < image->bpp*region[0]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (image->slice_pitch) {
- if (!slice_pitch)
- slice_pitch = row_pitch*region[1];
- else if (slice_pitch < row_pitch*region[1]) {
- err = CL_INVALID_VALUE;
- goto error;
- }
- }
- else if (slice_pitch) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!ptr) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueWriteImage;
- data->mem_obj = mem;
- data->const_ptr = ptr;
- data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
- data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
- data->row_pitch = row_pitch;
- data->slice_pitch = slice_pitch;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueCopyImage(cl_command_queue command_queue,
- cl_mem src_mem,
- cl_mem dst_mem,
- const size_t * psrc_origin,
- const size_t * pdst_origin,
- const size_t * pregion,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
- cl_bool overlap = CL_TRUE;
- cl_int i = 0;
-
- CHECK_QUEUE(command_queue);
- CHECK_IMAGE(src_mem, src_image);
- CHECK_IMAGE(dst_mem, dst_image);
- FIXUP_IMAGE_REGION(src_image, pregion, region);
- FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
- FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
- if (command_queue->ctx != src_mem->ctx ||
- command_queue->ctx != dst_mem->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
- src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
- err = CL_IMAGE_FORMAT_MISMATCH;
- goto error;
- }
-
- if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
- src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
- dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
- (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (src_image == dst_image) {
- for(i = 0; i < 3; i++)
- overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
- && (dst_origin[i] < src_origin[i] + region[i]);
- if(overlap == CL_TRUE) {
- err = CL_MEM_COPY_OVERLAP;
- goto error;
- }
- }
-
- cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueCopyImage;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
- cl_mem src_mem,
- cl_mem dst_buffer,
- const size_t * psrc_origin,
- const size_t * pregion,
- size_t dst_offset,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_IMAGE(src_mem, src_image);
- CHECK_MEM(dst_buffer);
- FIXUP_IMAGE_REGION(src_image, pregion, region);
- FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
- if (command_queue->ctx != src_mem->ctx ||
- command_queue->ctx != dst_buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
- src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueCopyImageToBuffer;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
-
-error:
- return err;
-}
-
-cl_int
-clEnqueueCopyBufferToImage(cl_command_queue command_queue,
- cl_mem src_buffer,
- cl_mem dst_mem,
- size_t src_offset,
- const size_t * pdst_origin,
- const size_t * pregion,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
-{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(src_buffer);
- CHECK_IMAGE(dst_mem, dst_image);
- FIXUP_IMAGE_REGION(dst_image, pregion, region);
- FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
- if (command_queue->ctx != src_buffer->ctx ||
- command_queue->ctx != dst_mem->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
- dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueCopyBufferToImage;
- data->queue = command_queue;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
- if(b_output_kernel_perf)
- time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
-
-error:
- return err;
-}
-
-static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr,
- size_t offset, size_t size,
- const size_t *origin, const size_t *region)
-{
- cl_int slot = -1;
- int err = CL_SUCCESS;
- size_t sub_offset = 0;
-
- if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
- struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
- sub_offset = buffer->sub_offset;
- }
-
- ptr = (char*)ptr + offset + sub_offset;
- if(mem->flags & CL_MEM_USE_HOST_PTR) {
- assert(mem->host_ptr);
-    //only compute the pointer here; the memcpy is done at enqueue time
- *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
- } else {
- *mem_ptr = ptr;
- }
- /* Record the mapped address. */
- if (!mem->mapped_ptr_sz) {
- mem->mapped_ptr_sz = 16;
- mem->mapped_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
- if (!mem->mapped_ptr) {
- cl_mem_unmap_auto(mem);
- err = CL_OUT_OF_HOST_MEMORY;
- goto error;
- }
- memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- slot = 0;
- } else {
- int i = 0;
- for (; i < mem->mapped_ptr_sz; i++) {
- if (mem->mapped_ptr[i].ptr == NULL) {
- slot = i;
- break;
- }
- }
- if (i == mem->mapped_ptr_sz) {
- cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
- if (!new_ptr) {
- cl_mem_unmap_auto(mem);
- err = CL_OUT_OF_HOST_MEMORY;
- goto error;
- }
- memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- memcpy(new_ptr, mem->mapped_ptr,
- mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- slot = mem->mapped_ptr_sz;
- mem->mapped_ptr_sz *= 2;
- free(mem->mapped_ptr);
- mem->mapped_ptr = new_ptr;
- }
- }
- assert(slot != -1);
- mem->mapped_ptr[slot].ptr = *mem_ptr;
- mem->mapped_ptr[slot].v_ptr = ptr;
- mem->mapped_ptr[slot].size = size;
- if(origin) {
- assert(region);
- mem->mapped_ptr[slot].origin[0] = origin[0];
- mem->mapped_ptr[slot].origin[1] = origin[1];
- mem->mapped_ptr[slot].origin[2] = origin[2];
- mem->mapped_ptr[slot].region[0] = region[0];
- mem->mapped_ptr[slot].region[1] = region[1];
- mem->mapped_ptr[slot].region[2] = region[2];
- }
- mem->map_ref++;
-error:
- if (err != CL_SUCCESS)
- *mem_ptr = NULL;
- return err;
-}
-
-void *
-clEnqueueMapBuffer(cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_map,
- cl_map_flags map_flags,
- size_t offset,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event,
- cl_int * errcode_ret)
-{
- cl_int err = CL_SUCCESS;
- void *ptr = NULL;
- void *mem_ptr = NULL;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(buffer);
- if (command_queue->ctx != buffer->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!size || offset + size > buffer->size) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((map_flags & CL_MAP_READ &&
- buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
- (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
- buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
- {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
-#ifdef HAS_CMRT
- if (command_queue->cmrt_event != NULL)
- cmrt_wait_for_task_finished(command_queue);
-#endif
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueMapBuffer;
- data->mem_obj = buffer;
- data->offset = offset;
- data->size = size;
- data->ptr = ptr;
- data->unsync_map = 1;
- if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
- data->write_map = 1;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- data->unsync_map = 0;
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if (err != CL_SUCCESS)
- goto error;
- ptr = data->ptr;
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- } else {
- if (buffer->is_userptr)
- ptr = buffer->host_ptr;
- else {
- if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
- err = CL_MAP_FAILURE;
- goto error;
- }
- }
- }
- err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
- if (err != CL_SUCCESS)
- goto error;
-
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return mem_ptr;
-}
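A hypothetical map/unmap round trip through these entry points; for CL_MEM_USE_HOST_PTR buffers, _cl_map_mem above hands back the host pointer directly and defers the copy to enqueue time (queue and buf are placeholders):

    /* Hypothetical usage: map for writing, fill, then unmap. */
    cl_int err;
    char *p = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_WRITE,
                                 0, 1024, 0, NULL, NULL, &err);
    if (err == CL_SUCCESS) {
      memset(p, 0, 1024);
      clEnqueueUnmapMemObject(queue, buf, p, 0, NULL, NULL);
    }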
-
-void *
-clEnqueueMapImage(cl_command_queue command_queue,
- cl_mem mem,
- cl_bool blocking_map,
- cl_map_flags map_flags,
- const size_t * porigin,
- const size_t * pregion,
- size_t * image_row_pitch,
- size_t * image_slice_pitch,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event,
- cl_int * errcode_ret)
-{
- cl_int err = CL_SUCCESS;
- void *ptr = NULL;
- void *mem_ptr = NULL;
- size_t offset = 0;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_IMAGE(mem, image);
- FIXUP_IMAGE_REGION(image, pregion, region);
- FIXUP_IMAGE_ORIGIN(image, porigin, origin);
- if (command_queue->ctx != mem->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
-
- if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
- err = CL_INVALID_VALUE;
- goto error;
- }
-
- if ((map_flags & CL_MAP_READ &&
- mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
- (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
- mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
- {
- err = CL_INVALID_OPERATION;
- goto error;
- }
-
-#ifdef HAS_CMRT
- if (command_queue->cmrt_event != NULL)
- cmrt_wait_for_task_finished(command_queue);
-#endif
+ INVALID_VALUE_IF (num_input_programs == 0 && input_programs == NULL);
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueMapImage;
- data->mem_obj = mem;
- data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
- data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
- data->ptr = ptr;
- data->unsync_map = 1;
- if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
- data->write_map = 1;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- data->unsync_map = 0;
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if (err != CL_SUCCESS)
- goto error;
- ptr = data->ptr;
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- } else {
- if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
- err = CL_MAP_FAILURE;
- goto error;
- }
- }
+ program = cl_program_link(context, num_input_programs, input_programs, options, &err);
- if(mem->flags & CL_MEM_USE_HOST_PTR) {
- if (image_slice_pitch)
- *image_slice_pitch = image->host_slice_pitch;
- *image_row_pitch = image->host_row_pitch;
+ if(program) program->is_built = CL_TRUE;
- offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2];
- } else {
- if (image_slice_pitch)
- *image_slice_pitch = image->slice_pitch;
- if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
- *image_row_pitch = image->slice_pitch;
- else
- *image_row_pitch = image->row_pitch;
-
- offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- }
- err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+ if (pfn_notify) pfn_notify(program, user_data);
error:
if (errcode_ret)
*errcode_ret = err;
- return mem_ptr; //TODO: map and unmap first
+ return program;
}
cl_int
-clEnqueueUnmapMemObject(cl_command_queue command_queue,
- cl_mem memobj,
- void * mapped_ptr,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
+clUnloadCompiler(void)
{
- cl_int err = CL_SUCCESS;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
- CHECK_MEM(memobj);
- if (command_queue->ctx != memobj->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
+ return CL_SUCCESS;
+}
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
+cl_int
+clUnloadPlatformCompiler(cl_platform_id platform)
+{
+ return CL_SUCCESS;
+}
- data = &no_wait_data;
- data->type = EnqueueUnmapMemObject;
- data->mem_obj = memobj;
- data->ptr = mapped_ptr;
+cl_kernel
+clCreateKernel(cl_program program,
+ const char * kernel_name,
+ cl_int * errcode_ret)
+{
+ cl_kernel kernel = NULL;
+ cl_int err = CL_SUCCESS;
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
+ CHECK_PROGRAM (program);
+ if (program->ker_n <= 0) {
+ err = CL_INVALID_PROGRAM_EXECUTABLE;
+ goto error;
}
+ INVALID_VALUE_IF (kernel_name == NULL);
+ kernel = cl_program_create_kernel(program, kernel_name, &err);
error:
- return err;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return kernel;
}
cl_int
-clEnqueueMigrateMemObjects(cl_command_queue command_queue,
- cl_uint num_mem_objects,
- const cl_mem * mem_objects,
- cl_mem_migration_flags flags,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
+clCreateKernelsInProgram(cl_program program,
+ cl_uint num_kernels,
+ cl_kernel * kernels,
+ cl_uint * num_kernels_ret)
{
- /* So far, we just support 1 device and no subdevice. So all the command queues
- belong to the small context. There is no need to migrate the mem objects by now. */
cl_int err = CL_SUCCESS;
- cl_uint i = 0;
- enqueue_data *data, defer_enqueue_data = { 0 };
- if (!flags & CL_MIGRATE_MEM_OBJECT_HOST)
- CHECK_QUEUE(command_queue);
-
- if (num_mem_objects == 0 || mem_objects == NULL) {
- err = CL_INVALID_VALUE;
+ CHECK_PROGRAM (program);
+ if (program->ker_n <= 0) {
+ err = CL_INVALID_PROGRAM_EXECUTABLE;
goto error;
}
-
- if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST |
- CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+ if (kernels && num_kernels < program->ker_n) {
err = CL_INVALID_VALUE;
goto error;
}
- for (i = 0; i < num_mem_objects; i++) {
- CHECK_MEM(mem_objects[i]);
- if (mem_objects[i]->ctx != command_queue->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
- }
-
-  /* Nothing to actually do; just fill in the event. */
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
- data = &defer_enqueue_data;
- data->type = EnqueueMigrateMemObj;
+ if(num_kernels_ret)
+ *num_kernels_ret = program->ker_n;
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
- }
+ if(kernels)
+ err = cl_program_create_kernels_in_program(program, kernels);
error:
return err;
}
cl_int
-clEnqueueNDRangeKernel(cl_command_queue command_queue,
- cl_kernel kernel,
- cl_uint work_dim,
- const size_t * global_work_offset,
- const size_t * global_work_size,
- const size_t * local_work_size,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
+clRetainKernel(cl_kernel kernel)
{
- size_t fixed_global_off[] = {0,0,0};
- size_t fixed_global_sz[] = {1,1,1};
- size_t fixed_local_sz[] = {1,1,1};
cl_int err = CL_SUCCESS;
- cl_uint i;
- enqueue_data *data, no_wait_data = { 0 };
-
- CHECK_QUEUE(command_queue);
CHECK_KERNEL(kernel);
+ cl_kernel_add_ref(kernel);
+error:
+ return err;
+}
- /* Check number of dimensions we have */
- if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
- err = CL_INVALID_WORK_DIMENSION;
- goto error;
- }
-
- /* We need a work size per dimension */
- if (UNLIKELY(global_work_size == NULL)) {
- err = CL_INVALID_GLOBAL_WORK_SIZE;
- goto error;
- }
-
- if (kernel->vme) {
- if (work_dim != 2) {
- err = CL_INVALID_WORK_DIMENSION;
- goto error;
- }
- if (local_work_size != NULL) {
- err = CL_INVALID_WORK_GROUP_SIZE;
- goto error;
- }
- }
-
- if (global_work_offset != NULL)
- for (i = 0; i < work_dim; ++i) {
- if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
- err = CL_INVALID_GLOBAL_OFFSET;
- goto error;
- }
- }
-
- /* Local sizes must be non-null and divide global sizes */
- if (local_work_size != NULL)
- for (i = 0; i < work_dim; ++i)
- if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
- err = CL_INVALID_WORK_GROUP_SIZE;
- goto error;
- }
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
+ cl_kernel_delete(kernel);
+error:
+ return err;
+}
- /* Queue and kernel must share the same context */
- assert(kernel->program);
- if (command_queue->ctx != kernel->program->ctx) {
- err = CL_INVALID_CONTEXT;
- goto error;
- }
+cl_int
+clSetKernelArg(cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void * arg_value)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
#ifdef HAS_CMRT
- if (kernel->cmrt_kernel != NULL) {
- err = cmrt_enqueue(command_queue, kernel, global_work_size, local_work_size);
- goto error;
- }
+ if (kernel->cmrt_kernel != NULL)
+ err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
+ else
#endif
-
- /* XXX No event right now */
- //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
- //FATAL_IF(event_wait_list != NULL, "Events are not supported");
- //FATAL_IF(event != NULL, "Events are not supported");
-
- if (local_work_size != NULL) {
- for (i = 0; i < work_dim; ++i)
- fixed_local_sz[i] = local_work_size[i];
- } else {
- if (kernel->vme) {
- fixed_local_sz[0] = 16;
- fixed_local_sz[1] = 1;
- } else {
-      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may be too large
- size_t realGroupSize = 1;
- for (i = 0; i< work_dim; i++) {
- for (j = maxDimSize; j > 1; j--) {
- if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
- fixed_local_sz[i] = j;
- maxGroupSize = maxGroupSize /j;
- maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
- break; //choose next work_dim
- }
- }
- realGroupSize *= fixed_local_sz[i];
- }
-
-      //In a conformance-test loop (such as test_api repeated_setup_cleanup), each iteration
-      //creates a new context and command queue and enqueues the kernel with
-      //'globalsize[0]=1000, localsize=NULL', which would trigger the following message many times.
-      //To avoid flooding the log, only print it the first time in the process.
-      //A static variable is enough; printing a few extra times in the multi-threaded case does not matter.
- static int warn_no_good_localsize = 1;
- if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
- warn_no_good_localsize = 0;
-        DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide local_work_size[] explicitly; you can find good values by trial and error.");
- }
- }
- }
-
- if (kernel->vme) {
- fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
- fixed_global_sz[1] = (global_work_size[1]+15) / 16;
- } else {
- for (i = 0; i < work_dim; ++i)
- fixed_global_sz[i] = global_work_size[i];
- }
- if (global_work_offset != NULL)
- for (i = 0; i < work_dim; ++i)
- fixed_global_off[i] = global_work_offset[i];
-
- if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
- if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
- || fixed_local_sz[1] != kernel->compile_wg_sz[1]
- || fixed_local_sz[2] != kernel->compile_wg_sz[2])
- {
- err = CL_INVALID_WORK_GROUP_SIZE;
- goto error;
- }
- }
-
-  /* Do device-specific checks and enqueue the kernel */
- err = cl_command_queue_ND_range(command_queue,
- kernel,
- work_dim,
- fixed_global_off,
- fixed_global_sz,
- fixed_local_sz);
- if(err != CL_SUCCESS)
- goto error;
-
- data = &no_wait_data;
- data->type = EnqueueNDRangeKernel;
- data->queue = command_queue;
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
- if (event && (*event)->type != CL_COMMAND_USER
- && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
- }
-
- err = cl_command_queue_flush(command_queue);
- }
-
+ err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
error:
- if(b_output_kernel_perf)
- {
- if(kernel->program->build_opts != NULL)
- time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
- else
- time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
- }
-
return err;
}
cl_int
-clEnqueueTask(cl_command_queue command_queue,
- cl_kernel kernel,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
+clSetKernelArgSVMPointer(cl_kernel kernel,
+ cl_uint arg_index,
+ const void *arg_value)
{
- const size_t global_size[3] = {1, 0, 0};
- const size_t local_size[3] = {1, 0, 0};
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
- return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size,
- num_events_in_wait_list, event_wait_list, event);
+ err = cl_kernel_set_arg_svm_pointer(kernel, arg_index, arg_value);
+error:
+ return err;
}
-
cl_int
-clEnqueueNativeKernel(cl_command_queue command_queue,
- void (*user_func)(void *),
- void * args,
- size_t cb_args,
- cl_uint num_mem_objects,
- const cl_mem * mem_list,
- const void ** args_mem_loc,
- cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list,
- cl_event * event)
+clSetKernelExecInfo(cl_kernel kernel,
+ cl_kernel_exec_info param_name,
+ size_t param_value_size,
+ const void *param_value)
{
+
cl_int err = CL_SUCCESS;
- void *new_args = NULL;
- enqueue_data *data, no_wait_data = { 0 };
- cl_int i;
-
- if(user_func == NULL ||
- (args == NULL && cb_args > 0) ||
- (args == NULL && num_mem_objects ==0) ||
- (args != NULL && cb_args == 0) ||
- (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
- (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+ CHECK_KERNEL(kernel);
+
+ if((param_name != CL_KERNEL_EXEC_INFO_SVM_PTRS &&
+ param_name != CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM) ||
+ param_value == NULL || param_value_size == 0) {
err = CL_INVALID_VALUE;
goto error;
}
- //Per spec, need copy args
- if (cb_args)
- {
- new_args = malloc(cb_args);
- if (!new_args)
- {
- err = CL_OUT_OF_HOST_MEMORY;
- goto error;
- }
- memcpy(new_args, args, cb_args);
-
- for (i=0; i<num_mem_objects; ++i)
- {
- CHECK_MEM(mem_list[i]);
- args_mem_loc[i] = new_args + (args_mem_loc[i] - args); //change to new args
- }
- }
-
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
-
- data = &no_wait_data;
- data->type = EnqueueNativeKernel;
- data->mem_list = mem_list;
- data->ptr = new_args;
- data->size = cb_args;
- data->offset = (size_t)num_mem_objects;
- data->const_ptr = args_mem_loc;
- data->user_func = user_func;
-
- if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(event ? *event : NULL, data);
- if(event) cl_event_set_status(*event, CL_COMPLETE);
+ if(param_name == CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM &&
+ *(cl_bool *)param_value == CL_TRUE) {
+ err = CL_INVALID_OPERATION;
+ goto error;
}
+ err = cl_kernel_set_exec_info(kernel, param_value_size, param_value);
error:
return err;
}
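
Only CL_KERNEL_EXEC_INFO_SVM_PTRS is usable here, since fine-grain system SVM is rejected above. A minimal caller sketch (editorial illustration, not part of this patch; "kernel" and an SVM allocation "svm_ptr" are assumed):

/* Declare the SVM allocations the kernel may access indirectly. */
void *ptrs[1] = { svm_ptr }; /* svm_ptr assumed from clSVMAlloc */
cl_int err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS,
                                 sizeof(ptrs), ptrs);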
-cl_int
-clEnqueueMarker(cl_command_queue command_queue,
- cl_event *event)
+cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
+ size_t param_value_size, void *param_value, size_t *param_value_size_ret)
{
cl_int err = CL_SUCCESS;
- CHECK_QUEUE(command_queue);
- if(event == NULL) {
+ CHECK_KERNEL(kernel);
+
+ if(kernel->program->build_opts == NULL ||
+ strstr(kernel->program->build_opts,"-cl-kernel-arg-info") == NULL ) {
+ err = CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
+ goto error;
+ }
+ if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER
+ && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
+ && param_name != CL_KERNEL_ARG_TYPE_NAME
+ && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
+ && param_name != CL_KERNEL_ARG_NAME) {
err = CL_INVALID_VALUE;
goto error;
}
- cl_event_marker_with_wait_list(command_queue, 0, NULL, event);
+ if (arg_index >= kernel->arg_n) {
+ err = CL_INVALID_ARG_INDEX;
+ goto error;
+ }
+
+ err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
+ param_value, param_value_size_ret);
+
error:
return err;
}
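
As the check above shows, argument info is only recorded when the program was built with -cl-kernel-arg-info. A usage sketch (editorial illustration; "program", "device", and "kernel" are assumed):

clBuildProgram(program, 1, &device, "-cl-kernel-arg-info", NULL, NULL);
char type_name[64];
cl_int err = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_NAME,
                                sizeof(type_name), type_name, NULL);
/* err is CL_KERNEL_ARG_INFO_NOT_AVAILABLE if the flag was omitted. */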
cl_int
-clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ return cl_get_kernel_workgroup_info(kernel,
+ device,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
+clGetKernelSubGroupInfoKHR(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t input_value_size,
+ const void * input_value,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ return cl_get_kernel_subgroup_info(kernel,
+ device,
+ param_name,
+ input_value_size,
+ input_value,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
+clRetainEvent(cl_event event)
{
cl_int err = CL_SUCCESS;
- CHECK_QUEUE(command_queue);
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+ CHECK_EVENT(event);
+ cl_event_add_ref(event);
- cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
error:
return err;
}
cl_int
-clEnqueueWaitForEvents(cl_command_queue command_queue,
- cl_uint num_events,
- const cl_event * event_list)
+clReleaseEvent(cl_event event)
{
cl_int err = CL_SUCCESS;
- CHECK_QUEUE(command_queue);
- err = clWaitForEvents(num_events, event_list);
+
+ CHECK_EVENT(event);
+ cl_event_delete(event);
error:
return err;
}
-cl_int
-clEnqueueBarrier(cl_command_queue command_queue)
+cl_mem clCreatePipe (cl_context context,
+ cl_mem_flags flags,
+ cl_uint pipe_packet_size,
+ cl_uint pipe_max_packets,
+ const cl_pipe_properties *properties,
+ cl_int *errcode_ret)
{
+ cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
- CHECK_QUEUE(command_queue);
+ cl_uint device_max_size = 0;
+
+ CHECK_CONTEXT (context);
+
+ if(UNLIKELY((flags & ~(CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS)) != 0)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(UNLIKELY(properties != NULL)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
- cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL);
+ if(UNLIKELY(pipe_packet_size == 0 || pipe_max_packets == 0)) {
+ err = CL_INVALID_PIPE_SIZE;
+ goto error;
+ }
+ if ((err = cl_get_device_info(context->devices[0],
+ CL_DEVICE_PIPE_MAX_PACKET_SIZE,
+ sizeof(device_max_size),
+ &device_max_size,
+ NULL)) != CL_SUCCESS) {
+ goto error;
+ }
+
+ if(UNLIKELY(pipe_packet_size > device_max_size)) {
+ err = CL_INVALID_PIPE_SIZE;
+ goto error;
+ }
+
+ if(flags == 0)
+ flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;
+
+ mem = cl_mem_new_pipe(context, flags, pipe_packet_size, pipe_max_packets, &err);
error:
- return err;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
}
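
Passing flags == 0 selects the default CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, and the packet size is validated against CL_DEVICE_PIPE_MAX_PACKET_SIZE. A caller sketch (editorial illustration; "ctx" is assumed):

cl_int err;
/* 128-byte packets, capacity for 1024 of them; flags 0 = default. */
cl_mem pipe = clCreatePipe(ctx, 0, 128, 1024, NULL, &err);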
-cl_int
-clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clGetPipeInfo (cl_mem pipe,
+ cl_pipe_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
cl_int err = CL_SUCCESS;
- CHECK_QUEUE(command_queue);
+ CHECK_MEM(pipe);
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+ err = cl_get_pipe_info(pipe,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
- cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
error:
return err;
}
@@ -3566,7 +1534,8 @@ clGetAcceleratorInfoINTEL(cl_accelerator_intel accel,
CHECK_ACCELERATOR_INTEL(accel);
if (param_name == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
- FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+ cl_uint ref = CL_OBJECT_GET_REF(accel);
+ FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
} else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
} else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
diff --git a/src/cl_api_command_queue.c b/src/cl_api_command_queue.c
new file mode 100644
index 0000000..b1aee12
--- /dev/null
+++ b/src/cl_api_command_queue.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_command_queue.h"
+#include "cl_device_id.h"
+#include "CL/cl.h"
+#include <stdio.h>
+
+/* Deprecated in OpenCL 2.0 and later. */
+cl_command_queue
+clCreateCommandQueue(cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret)
+{
+ cl_command_queue queue = NULL;
+ cl_int err = CL_SUCCESS;
+
+ do {
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ err = cl_devices_list_include_check(context->device_num, context->devices, 1, &device);
+ if (err)
+ break;
+
+ if (properties & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { /* not supported yet. */
+ err = CL_INVALID_QUEUE_PROPERTIES;
+ break;
+ }
+
+ queue = cl_create_command_queue(context, device, properties, 0, &err);
+ } while (0);
+
+ if (errcode_ret)
+ *errcode_ret = err;
+ return queue;
+}
+
+/* OpenCL 2.0 API for creating a command queue. */
+cl_command_queue
+clCreateCommandQueueWithProperties(cl_context context,
+ cl_device_id device,
+ const cl_queue_properties *properties,
+ cl_int *errcode_ret)
+{
+ cl_command_queue queue = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_command_queue_properties prop = 0xFFFFFFFF; /* sentinel: not yet set */
+ cl_uint queue_sz = 0xFFFFFFFF; /* sentinel: not yet set */
+
+ do {
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ err = cl_devices_list_include_check(context->device_num, context->devices, 1, &device);
+ if (err)
+ break;
+
+ if (properties) {
+ cl_ulong que_type;
+ cl_ulong que_val;
+ cl_uint i;
+ /* properties is a zero-terminated list of (name, value) pairs. */
+ for (i = 0; (que_type = properties[i++]) != 0; i++) {
+ que_val = properties[i];
+ switch (que_type) {
+ case CL_QUEUE_PROPERTIES:
+ if (prop != 0xFFFFFFFF)
+ err = CL_INVALID_VALUE;
+ else {
+ switch (que_val) {
+ case 0:
+ case CL_QUEUE_PROFILING_ENABLE:
+ case CL_QUEUE_PROFILING_ENABLE |
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE:
+ case CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE:
+ case CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE:
+ case CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
+ CL_QUEUE_ON_DEVICE_DEFAULT:
+ case CL_QUEUE_PROFILING_ENABLE |
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE:
+ case CL_QUEUE_PROFILING_ENABLE |
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
+ CL_QUEUE_ON_DEVICE_DEFAULT:
+ prop = que_val;
+ break;
+ default:
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ }
+ break;
+ case CL_QUEUE_SIZE:
+ queue_sz = que_val;
+ break;
+ default:
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ }
+
+ if (err) /* break out of the loop and return the error. */
+ break;
+ }
+
+ /* Set unspecified parameters to their default values. */
+ if (prop == 0xFFFFFFFF)
+ prop = 0;
+ /* CL_QUEUE_SIZE is only valid for on-device queues. */
+ if (queue_sz != 0xFFFFFFFF && !(prop & CL_QUEUE_ON_DEVICE)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ if (queue_sz == 0xFFFFFFFF)
+ queue_sz = device->queue_on_device_preferred_size;
+
+ if (queue_sz > device->queue_on_device_max_size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ queue = cl_create_command_queue(context, device, prop, queue_sz, &err);
+ } while (0);
+
+ if (errcode_ret)
+ *errcode_ret = err;
+ return queue;
+}
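+
The properties argument is parsed above as a zero-terminated list of (name, value) pairs. A sketch of a conforming list (editorial illustration; "ctx" and "dev" are assumed):

cl_queue_properties props[] = {
  CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
  0 /* terminator */
};
cl_int err;
cl_command_queue q = clCreateCommandQueueWithProperties(ctx, dev, props, &err);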
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ cl_int ref;
+
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ if (param_name == CL_QUEUE_CONTEXT) {
+ src_ptr = &command_queue->ctx;
+ src_size = sizeof(cl_context);
+ } else if (param_name == CL_QUEUE_DEVICE) {
+ src_ptr = &command_queue->device;
+ src_size = sizeof(cl_device_id);
+ } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
+ ref = CL_OBJECT_GET_REF(command_queue);
+ src_ptr = &ref;
+ src_size = sizeof(cl_int);
+ } else if (param_name == CL_QUEUE_PROPERTIES) {
+ src_ptr = &command_queue->props;
+ src_size = sizeof(cl_command_queue_properties);
+ } else if (param_name == CL_QUEUE_SIZE) {
+ src_ptr = &command_queue->size;
+ src_size = sizeof(command_queue->size);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
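+
The src_ptr/src_size pairs above feed cl_get_info_helper, which implements the standard two-call idiom: query the size first, then the value (editorial sketch; "q" is an assumed existing queue):

size_t sz = 0;
clGetCommandQueueInfo(q, CL_QUEUE_PROPERTIES, 0, NULL, &sz);
cl_command_queue_properties props;
clGetCommandQueueInfo(q, CL_QUEUE_PROPERTIES, sz, &props, NULL);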
+
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ return cl_command_queue_wait_flush(command_queue);
+}
+
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ return cl_command_queue_wait_finish(command_queue);
+}
+
+cl_int
+clRetainCommandQueue(cl_command_queue command_queue)
+{
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+ cl_command_queue_add_ref(command_queue);
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseCommandQueue(cl_command_queue command_queue)
+{
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ cl_command_queue_wait_flush(command_queue);
+
+ cl_command_queue_delete(command_queue);
+ return CL_SUCCESS;
+}
diff --git a/src/cl_api_context.c b/src/cl_api_context.c
new file mode 100644
index 0000000..e8184b1
--- /dev/null
+++ b/src/cl_api_context.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_context.h"
+#include "cl_device_id.h"
+#include "cl_alloc.h"
+
+cl_context
+clCreateContext(const cl_context_properties *properties,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ void (*pfn_notify)(const char *, const void *, size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_context context = NULL;
+
+ do {
+ /* Assure parameters correctness */
+ if (devices == NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (num_devices == 0) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (pfn_notify == NULL && user_data != NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_devices_list_check(num_devices, devices);
+ if (err != CL_SUCCESS)
+ break;
+
+ context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
+ } while (0);
+
+ if (errcode_ret)
+ *errcode_ret = err;
+ return context;
+}
+
+cl_context
+clCreateContextFromType(const cl_context_properties *properties,
+ cl_device_type device_type,
+ void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret)
+{
+ cl_context context = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_device_id *devices = NULL;
+ cl_uint num_devices = 0;
+ const cl_device_type valid_type = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_ACCELERATOR |
+ CL_DEVICE_TYPE_DEFAULT | CL_DEVICE_TYPE_CUSTOM;
+
+ do {
+ /* Assure parameters correctness */
+ if (pfn_notify == NULL && user_data != NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((device_type & valid_type) == 0) {
+ err = CL_INVALID_DEVICE_TYPE;
+ break;
+ }
+
+ /* Get the devices num first. */
+ err = cl_get_device_ids(NULL, device_type, 0, NULL, &num_devices);
+ if (err != CL_SUCCESS)
+ break;
+
+ assert(num_devices > 0);
+ devices = cl_malloc(num_devices * sizeof(cl_device_id));
+ err = cl_get_device_ids(NULL, device_type, num_devices, &devices[0], &num_devices);
+ if (err != CL_SUCCESS)
+ break;
+
+ context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
+ } while (0);
+
+ if (devices)
+ cl_free(devices);
+ if (errcode_ret)
+ *errcode_ret = err;
+ return context;
+}
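+
clCreateContextFromType queries the device count first, then fetches the IDs and builds the context, so the caller never handles the device list itself (editorial sketch):

cl_int err;
cl_context ctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU,
                                         NULL, NULL, &err);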
+
+cl_int
+clRetainContext(cl_context context)
+{
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ return CL_INVALID_CONTEXT;
+ }
+
+ cl_context_add_ref(context);
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseContext(cl_context context)
+{
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ return CL_INVALID_CONTEXT;
+ }
+
+ cl_context_delete(context);
+ return CL_SUCCESS;
+}
+
+cl_int
+clGetContextInfo(cl_context context,
+ cl_context_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ cl_uint n, ref;
+ cl_context_properties p;
+
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ return CL_INVALID_CONTEXT;
+ }
+
+ if (param_name == CL_CONTEXT_DEVICES) {
+ src_ptr = context->devices;
+ src_size = sizeof(cl_device_id) * context->device_num;
+ } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
+ n = context->device_num;
+ src_ptr = &n;
+ src_size = sizeof(cl_uint);
+ } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
+ ref = CL_OBJECT_GET_REF(context);
+ src_ptr = &ref;
+ src_size = sizeof(cl_uint);
+ } else if (param_name == CL_CONTEXT_PROPERTIES) {
+ if (context->prop_len > 0) {
+ src_ptr = context->prop_user;
+ src_size = sizeof(cl_context_properties) * context->prop_len;
+ } else {
+ p = 0;
+ src_ptr = &p;
+ src_size = sizeof(cl_context_properties);
+ }
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
diff --git a/src/cl_api_device_id.c b/src/cl_api_device_id.c
new file mode 100644
index 0000000..4ffef78
--- /dev/null
+++ b/src/cl_api_device_id.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_device_id.h"
+#include "cl_platform_id.h"
+
+cl_int
+clGetDeviceIDs(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id *devices,
+ cl_uint *num_devices)
+{
+ const cl_device_type valid_type = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU |
+ CL_DEVICE_TYPE_ACCELERATOR | CL_DEVICE_TYPE_DEFAULT |
+ CL_DEVICE_TYPE_CUSTOM;
+
+ /* Check parameter consistency */
+ if (UNLIKELY(devices == NULL && num_devices == NULL))
+ return CL_INVALID_VALUE;
+ if (UNLIKELY(platform && platform != cl_get_platform_default()))
+ return CL_INVALID_PLATFORM;
+ if (UNLIKELY(devices && num_entries == 0))
+ return CL_INVALID_VALUE;
+ if ((device_type & valid_type) == 0)
+ return CL_INVALID_DEVICE_TYPE;
+
+ return cl_get_device_ids(platform, device_type, num_entries, devices, num_devices);
+}
+
+cl_int
+clGetDeviceInfo(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ if (!CL_OBJECT_IS_DEVICE(device)) {
+ return CL_INVALID_DEVICE;
+ }
+
+ return cl_get_device_info(device, param_name, param_value_size,
+ param_value, param_value_size_ret);
+}
+
+cl_int
+clRetainDevice(cl_device_id device)
+{
+ // XXX stub for C++ Bindings
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseDevice(cl_device_id device)
+{
+ // XXX stub for C++ Bindings
+ return CL_SUCCESS;
+}
+
+cl_int
+clCreateSubDevices(cl_device_id in_device,
+ const cl_device_partition_property *properties,
+ cl_uint num_devices,
+ cl_device_id *out_devices,
+ cl_uint *num_devices_ret)
+{
+ /* Check parameter consistency */
+ if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
+ return CL_INVALID_VALUE;
+ if (UNLIKELY(in_device == NULL && properties == NULL))
+ return CL_INVALID_VALUE;
+
+ if (num_devices_ret) /* may be NULL when only out_devices was passed */
+ *num_devices_ret = 0;
+ return CL_INVALID_DEVICE_PARTITION_COUNT;
+}
diff --git a/src/cl_api_event.c b/src/cl_api_event.c
new file mode 100644
index 0000000..5f3a116
--- /dev/null
+++ b/src/cl_api_event.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "CL/cl.h"
+#include <stdio.h>
+
+cl_event
+clCreateUserEvent(cl_context context,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_event event = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ event = cl_event_create(context, NULL, 0, NULL, CL_COMMAND_USER, &err);
+ } while (0);
+
+ if (errcode_ret)
+ *errcode_ret = err;
+ return event;
+}
+
+cl_int
+clSetUserEventStatus(cl_event event,
+ cl_int execution_status)
+{
+ cl_int err = CL_SUCCESS;
+
+ if (!CL_OBJECT_IS_EVENT(event)) {
+ return CL_INVALID_EVENT;
+ }
+
+ if (execution_status > CL_COMPLETE) {
+ return CL_INVALID_VALUE;
+ }
+
+ err = cl_event_set_status(event, execution_status);
+ return err;
+}
+
+/* 1.1 API, deprecated */
+cl_int
+clEnqueueMarker(cl_command_queue command_queue,
+ cl_event *event)
+{
+ return clEnqueueMarkerWithWaitList(command_queue, 0, NULL, event);
+}
+
+cl_int
+clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ cl_event e = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (event == NULL) { /* An anonymous marker cannot be waited on, so there is nothing to do. */
+ return CL_SUCCESS;
+ }
+
+ e = cl_event_create_marker_or_barrier(command_queue, num_events_in_wait_list,
+ event_wait_list, CL_FALSE, &err);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) { // Error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ } else if (e_status == CL_COMPLETE) {
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+ return err;
+}
+
+/* 1.1 API, deprecated */
+cl_int
+clEnqueueBarrier(cl_command_queue command_queue)
+{
+ return clEnqueueBarrierWithWaitList(command_queue, 0, NULL, NULL);
+}
+
+cl_int
+clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ cl_event e = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create_marker_or_barrier(command_queue, num_events_in_wait_list,
+ event_wait_list, CL_TRUE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) { // Error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ } else if (e_status == CL_COMPLETE) {
+ cl_command_queue_insert_barrier_event(command_queue, e);
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ /* Already a completed barrier, no need to insert to queue. */
+ } else {
+ cl_command_queue_insert_barrier_event(command_queue, e);
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+ return err;
+}
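+
Unlike a marker, the barrier is inserted into the queue's barrier list even when it completes immediately, so later commands order after it. A caller sketch (editorial illustration; "q" is assumed):

cl_event barrier_ev;
clEnqueueBarrierWithWaitList(q, 0, NULL, &barrier_ev);
/* barrier_ev can now gate commands on other queues. */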
+
+cl_int
+clWaitForEvents(cl_uint num_events,
+ const cl_event *event_list)
+{
+ cl_int err = CL_SUCCESS;
+ cl_uint i;
+
+ if (num_events == 0 || event_list == NULL) {
+ return CL_INVALID_VALUE;
+ }
+
+ err = cl_event_check_waitlist(num_events, event_list, NULL, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ for (i = 0; i < num_events; i++) {
+ if (cl_event_get_status(event_list[i]) < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ return err;
+ }
+ }
+
+ err = cl_event_wait_for_events_list(num_events, event_list);
+ return err;
+}
+
+/* 1.1 API, deprecated */
+cl_int
+clEnqueueWaitForEvents(cl_command_queue command_queue,
+ cl_uint num_events,
+ const cl_event *event_list)
+{
+ cl_int err = CL_SUCCESS;
+
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ err = clWaitForEvents(num_events, event_list);
+ return err;
+}
+
+cl_int
+clSetEventCallback(cl_event event,
+ cl_int command_exec_callback_type,
+ void(CL_CALLBACK *pfn_notify)(cl_event, cl_int, void *),
+ void *user_data)
+{
+ cl_int err = CL_SUCCESS;
+
+ if (!CL_OBJECT_IS_EVENT(event)) {
+ return CL_INVALID_EVENT;
+ }
+
+ if ((pfn_notify == NULL) ||
+ (command_exec_callback_type > CL_SUBMITTED) ||
+ (command_exec_callback_type < CL_COMPLETE)) {
+ return CL_INVALID_VALUE;
+ }
+
+ err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
+ return err;
+}
+
+cl_int
+clGetEventInfo(cl_event event,
+ cl_event_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ void *src_ptr = NULL;
+ size_t src_size = 0;
+ cl_uint ref;
+ cl_int status;
+
+ if (!CL_OBJECT_IS_EVENT(event)) {
+ return CL_INVALID_EVENT;
+ }
+
+ if (param_name == CL_EVENT_COMMAND_QUEUE) {
+ src_ptr = &event->queue;
+ src_size = sizeof(cl_command_queue);
+ } else if (param_name == CL_EVENT_CONTEXT) {
+ src_ptr = &event->ctx;
+ src_size = sizeof(cl_context);
+ } else if (param_name == CL_EVENT_COMMAND_TYPE) {
+ src_ptr = &event->event_type;
+ src_size = sizeof(cl_command_type);
+ } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
+ status = cl_event_get_status(event);
+ src_ptr = &status;
+ src_size = sizeof(cl_int);
+ } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
+ ref = CL_OBJECT_GET_REF(event);
+ src_ptr = &ref;
+ src_size = sizeof(cl_int);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ cl_ulong ret_val;
+
+ if (!CL_OBJECT_IS_EVENT(event)) {
+ return CL_INVALID_EVENT;
+ }
+
+ assert(event->event_type == CL_COMMAND_USER || event->queue != NULL);
+ if (event->event_type == CL_COMMAND_USER ||
+ !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
+ cl_event_get_status(event) != CL_COMPLETE) {
+ return CL_PROFILING_INFO_NOT_AVAILABLE;
+ }
+
+ if (param_value && param_value_size < sizeof(cl_ulong)) {
+ return CL_INVALID_VALUE;
+ }
+
+ if (param_name < CL_PROFILING_COMMAND_QUEUED ||
+ param_name > CL_PROFILING_COMMAND_COMPLETE) {
+ return CL_INVALID_VALUE;
+ }
+
+ ret_val = event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED];
+ if (ret_val == CL_EVENT_INVALID_TIMESTAMP) {
+ return CL_INVALID_VALUE;
+ }
+
+ if (param_value)
+ *(cl_ulong *)param_value = ret_val;
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(cl_ulong);
+ return CL_SUCCESS;
+}
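+
The timestamps are indexed by param_name - CL_PROFILING_COMMAND_QUEUED, so a caller on a CL_QUEUE_PROFILING_ENABLE queue can compute kernel wall time as follows (editorial sketch; "ev" is an assumed completed kernel event):

cl_ulong t_start = 0, t_end = 0;
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                        sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                        sizeof(t_end), &t_end, NULL);
double ms = (double)(t_end - t_start) * 1e-6; /* timestamps are in ns */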
diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c
new file mode 100644
index 0000000..13ea8c0
--- /dev/null
+++ b/src/cl_api_kernel.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_mem.h"
+#include "cl_kernel.h"
+#include "cl_enqueue.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_alloc.h"
+#include "CL/cl.h"
+#include <stdio.h>
+#include <string.h>
+
+cl_int
+clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ const char *str = NULL;
+ cl_int ref;
+ cl_uint n;
+
+ if (!CL_OBJECT_IS_KERNEL(kernel)) {
+ return CL_INVALID_KERNEL;
+ }
+
+ if (param_name == CL_KERNEL_CONTEXT) {
+ src_ptr = &kernel->program->ctx;
+ src_size = sizeof(cl_context);
+ } else if (param_name == CL_KERNEL_PROGRAM) {
+ src_ptr = &kernel->program;
+ src_size = sizeof(cl_program);
+ } else if (param_name == CL_KERNEL_NUM_ARGS) {
+ n = kernel->arg_n;
+ src_ptr = &n;
+ src_size = sizeof(cl_uint);
+ } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
+ ref = CL_OBJECT_GET_REF(kernel);
+ src_ptr = &ref;
+ src_size = sizeof(cl_int);
+ } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
+ str = cl_kernel_get_name(kernel);
+ src_ptr = str;
+ src_size = strlen(str) + 1;
+ } else if (param_name == CL_KERNEL_ATTRIBUTES) {
+ str = cl_kernel_get_attributes(kernel);
+ src_ptr = str;
+ src_size = strlen(str) + 1;
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
+
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ size_t fixed_global_off[] = {0, 0, 0};
+ size_t fixed_global_sz[] = {1, 1, 1};
+ size_t fixed_local_sz[] = {1, 1, 1};
+ cl_int err = CL_SUCCESS;
+ cl_uint i;
+ cl_event e = NULL;
+ cl_int event_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_KERNEL(kernel)) {
+ err = CL_INVALID_KERNEL;
+ break;
+ }
+
+ /* Check number of dimensions we have */
+ if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
+ err = CL_INVALID_WORK_DIMENSION;
+ break;
+ }
+
+ /* We need a work size per dimension */
+ if (UNLIKELY(global_work_size == NULL)) {
+ err = CL_INVALID_GLOBAL_WORK_SIZE;
+ break;
+ }
+
+ if (kernel->vme) {
+ if (work_dim != 2) {
+ err = CL_INVALID_WORK_DIMENSION;
+ break;
+ }
+ if (local_work_size != NULL) {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ break;
+ }
+ }
+
+ if (global_work_offset != NULL) {
+ for (i = 0; i < work_dim; ++i) {
+ /* The sum can wrap; test for overflow without computing it. */
+ if (UNLIKELY(global_work_offset[i] > (size_t)-1 - global_work_size[i])) {
+ err = CL_INVALID_GLOBAL_OFFSET;
+ break;
+ }
+ }
+ if (err != CL_SUCCESS)
+ break;
+ }
+
+ /* Queue and kernel must share the same context */
+ assert(kernel->program);
+ if (command_queue->ctx != kernel->program->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (local_work_size != NULL) {
+ for (i = 0; i < work_dim; ++i)
+ fixed_local_sz[i] = local_work_size[i];
+ } else {
+ if (kernel->vme) {
+ fixed_local_sz[0] = 16;
+ fixed_local_sz[1] = 1;
+ } else {
+ uint j, maxDimSize = 64, maxGroupSize = 256; /* MAX_WORK_GROUP_SIZE may be too large */
+ size_t realGroupSize = 1;
+ for (i = 0; i < work_dim; i++) {
+ for (j = maxDimSize; j > 1; j--) {
+ if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+ fixed_local_sz[i] = j;
+ maxGroupSize = maxGroupSize / j;
+ maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+ break; //choose next work_dim
+ }
+ }
+ realGroupSize *= fixed_local_sz[i];
+ }
+
+ /* In a loop of a conformance test (such as test_api repeated_setup_cleanup),
+ each iteration creates a new context and command queue and enqueues the
+ kernel with global_size[0]=1000 and local_size=NULL, which would trigger
+ the message below many times. To avoid flooding the log, print it only
+ once per process; a plain static flag is fine, since printing a few extra
+ times in the multi-threaded case does not matter. */
+ static int warn_no_good_localsize = 1;
+ if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
+ warn_no_good_localsize = 0;
+ DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide\n"
+ " local_work_size[] explicitly, you can find good values with\n"
+ " trial-and-error method.");
+ }
+ }
+ }
+
+ if (kernel->vme) {
+ fixed_global_sz[0] = (global_work_size[0] + 15) / 16 * 16;
+ fixed_global_sz[1] = (global_work_size[1] + 15) / 16;
+ } else {
+ for (i = 0; i < work_dim; ++i)
+ fixed_global_sz[i] = global_work_size[i];
+ }
+
+ if (global_work_offset != NULL)
+ for (i = 0; i < work_dim; ++i)
+ fixed_global_off[i] = global_work_offset[i];
+
+ if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+ if (fixed_local_sz[0] != kernel->compile_wg_sz[0] ||
+ fixed_local_sz[1] != kernel->compile_wg_sz[1] ||
+ fixed_local_sz[2] != kernel->compile_wg_sz[2]) {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ break;
+ }
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ int i, j, k; /* note: i shadows the cl_uint i declared above */
+ const size_t global_wk_sz_div[3] = {
+ fixed_global_sz[0] / fixed_local_sz[0] * fixed_local_sz[0],
+ fixed_global_sz[1] / fixed_local_sz[1] * fixed_local_sz[1],
+ fixed_global_sz[2] / fixed_local_sz[2] * fixed_local_sz[2]};
+
+ const size_t global_wk_sz_rem[3] = {
+ fixed_global_sz[0] % fixed_local_sz[0],
+ fixed_global_sz[1] % fixed_local_sz[1],
+ fixed_global_sz[2] % fixed_local_sz[2]};
+ cl_uint count;
+ count = global_wk_sz_rem[0] ? 2 : 1;
+ count *= global_wk_sz_rem[1] ? 2 : 1;
+ count *= global_wk_sz_rem[2] ? 2 : 1;
+
+ const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+ /* Go through the (at most 8) cases and enqueue where work items remain. */
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ for (k = 0; k < 2; k++) {
+ size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+ size_t global_dim_off[3] = {
+ k * global_wk_sz_div[0] / fixed_local_sz[0],
+ j * global_wk_sz_div[1] / fixed_local_sz[1],
+ i * global_wk_sz_div[2] / fixed_local_sz[2]};
+ size_t local_wk_sz_use[3] = {
+ k ? global_wk_sz_rem[0] : fixed_local_sz[0],
+ j ? global_wk_sz_rem[1] : fixed_local_sz[1],
+ i ? global_wk_sz_rem[2] : fixed_local_sz[2]};
+ if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0)
+ continue;
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_NDRANGE_KERNEL, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ /* Do device-specific checks and enqueue the kernel. */
+ err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim,
+ fixed_global_off, global_dim_off, fixed_global_sz,
+ global_wk_sz_use, fixed_local_sz, local_wk_sz_use);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ e->exec_data.mid_event_of_enq = (count > 1);
+ count--;
+
+ /* Flush the ND-range immediately if it depends on no events; otherwise
+ add it to the queue list. The finish/complete status transition is
+ always handled from the queue list. */
+ event_status = cl_event_is_ready(e);
+ if (event_status < CL_COMPLETE) { // Error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+
+ if (e->exec_data.mid_event_of_enq)
+ cl_event_delete(e);
+ }
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
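+
The triple loop above splits a global size that is not a multiple of the local size into a uniform part and a remainder part per dimension, enqueuing up to 8 uniform sub-ranges. A worked instance for one dimension (editorial sketch):

size_t global = 1000, local = 64;
size_t div_part = global / local * local; /* 960: 15 full work-groups */
size_t rem_part = global % local;         /* 40: one smaller group    */
/* Two uniform enqueues replace the single non-uniform one. */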
+
+cl_int
+clEnqueueTask(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ const size_t global_size[3] = {1, 0, 0};
+ const size_t local_size[3] = {1, 0, 0};
+
+ return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+ global_size, local_size,
+ num_events_in_wait_list, event_wait_list, event);
+}
+
+cl_int
+clEnqueueNativeKernel(cl_command_queue command_queue,
+ void (*user_func)(void *),
+ void *args,
+ size_t cb_args,
+ cl_uint num_mem_objects,
+ const cl_mem *mem_list,
+ const void **args_mem_loc,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ void *new_args = NULL;
+ void **new_args_mem_loc = NULL;
+ cl_mem *new_mem_list = NULL;
+ cl_int i;
+ cl_int e_status;
+ cl_event e = NULL;
+ enqueue_data *data = NULL;
+
+ do {
+ if (user_func == NULL ||
+ (args == NULL && cb_args > 0) ||
+ (args == NULL && num_mem_objects > 0) ||
+ (args != NULL && cb_args == 0) ||
+ (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
+ (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ /* Per the spec, the args block must be copied. */
+ if (cb_args) {
+ new_args = cl_malloc(cb_args);
+ if (num_mem_objects) {
+ new_args_mem_loc = cl_malloc(sizeof(void *) * num_mem_objects);
+ new_mem_list = cl_malloc(sizeof(cl_mem) * num_mem_objects);
+ }
+
+ /* The per-mem-object arrays are only required when num_mem_objects > 0. */
+ if (new_args == NULL ||
+ (num_mem_objects && (new_args_mem_loc == NULL || new_mem_list == NULL))) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ break;
+ }
+ memcpy(new_args, args, cb_args);
+ if (num_mem_objects)
+ memcpy(new_mem_list, mem_list, sizeof(cl_mem) * num_mem_objects);
+
+ for (i = 0; i < num_mem_objects; ++i) {
+ if (!CL_OBJECT_IS_MEM(mem_list[i])) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ new_args_mem_loc[i] = new_args + (args_mem_loc[i] - args); /* rebase into the copied args block */
+ }
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_NATIVE_KERNEL, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueNativeKernel;
+ data->mem_list = new_mem_list;
+ data->ptr = new_args;
+ data->size = cb_args;
+ data->offset = (size_t)num_mem_objects;
+ data->const_ptr = new_args_mem_loc;
+ data->user_func = user_func;
+ new_args = NULL;
+ new_mem_list = NULL;
+ new_args_mem_loc = NULL; // Event delete will free them.
+
+ err = cl_event_exec(e, (e_status == CL_COMPLETE ? CL_COMPLETE : CL_QUEUED), CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (e_status != CL_COMPLETE)
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err != CL_SUCCESS) {
+ if (new_args)
+ cl_free(new_args);
+ if (new_mem_list)
+ cl_free(new_mem_list);
+ if (new_args_mem_loc)
+ cl_free(new_args_mem_loc);
+ }
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
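+
Per the copy-and-rebase logic above, the caller's args block may live on the stack; each args_mem_loc[i] points at the slot inside args that the runtime patches with the mapped address of mem_list[i]. A caller sketch (editorial illustration; "q", a buffer "buf", and a host function "my_func" of type void (*)(void *) are assumed):

struct nk_args { float *dst; int n; };
struct nk_args a = { NULL, 64 };  /* dst is patched at execution time */
const void *loc[1] = { &a.dst };  /* where the buffer pointer lives   */
cl_mem mems[1] = { buf };
clEnqueueNativeKernel(q, my_func, &a, sizeof(a), 1, mems, loc,
                      0, NULL, NULL);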
diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
new file mode 100644
index 0000000..0d19bf8
--- /dev/null
+++ b/src/cl_api_mem.c
@@ -0,0 +1,2435 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_mem.h"
+#include "cl_enqueue.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+#include "CL/cl.h"
+
+cl_int
+clSetMemObjectDestructorCallback(cl_mem memobj,
+ void(CL_CALLBACK *pfn_notify)(cl_mem, void *),
+ void *user_data)
+{
+ if (!CL_OBJECT_IS_MEM(memobj))
+ return CL_INVALID_MEM_OBJECT;
+
+ if (pfn_notify == NULL)
+ return CL_INVALID_VALUE;
+
+ return cl_mem_set_destructor_callback(memobj, pfn_notify, user_data);
+}
+
+cl_int
+clGetMemObjectInfo(cl_mem memobj,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ cl_mem_object_type type;
+ size_t ptr, offset;
+ cl_int ref;
+ cl_mem parent;
+
+ if (!CL_OBJECT_IS_MEM(memobj)) {
+ return CL_INVALID_MEM_OBJECT;
+ }
+
+ switch (param_name) {
+ case CL_MEM_TYPE: {
+ type = cl_get_mem_object_type(memobj);
+ src_ptr = &type;
+ src_size = sizeof(cl_mem_object_type);
+ break;
+ }
+ case CL_MEM_FLAGS:
+ src_ptr = &memobj->flags;
+ src_size = sizeof(cl_mem_flags);
+ break;
+ case CL_MEM_SIZE:
+ src_ptr = &memobj->size;
+ src_size = sizeof(size_t);
+ break;
+ case CL_MEM_HOST_PTR: {
+ ptr = 0;
+ if (memobj->type == CL_MEM_IMAGE_TYPE) {
+ ptr = (size_t)memobj->host_ptr;
+ } else {
+ struct _cl_mem_buffer *buf = (struct _cl_mem_buffer *)memobj;
+ ptr = (size_t)memobj->host_ptr + buf->sub_offset;
+ }
+ src_ptr = &ptr;
+ src_size = sizeof(size_t);
+ break;
+ }
+ case CL_MEM_USES_SVM_POINTER: {
+ src_ptr = &memobj->is_svm;
+ src_size = sizeof(memobj->is_svm);
+ break;
+ }
+ case CL_MEM_MAP_COUNT:
+ src_ptr = &memobj->map_ref;
+ src_size = sizeof(cl_uint);
+ break;
+ case CL_MEM_REFERENCE_COUNT: {
+ ref = CL_OBJECT_GET_REF(memobj);
+ src_ptr = &ref;
+ src_size = sizeof(cl_int);
+ break;
+ }
+ case CL_MEM_CONTEXT:
+ src_ptr = &memobj->ctx;
+ src_size = sizeof(cl_context);
+ break;
+ case CL_MEM_ASSOCIATED_MEMOBJECT: {
+ parent = NULL;
+ if (memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+ struct _cl_mem_buffer *buf = (struct _cl_mem_buffer *)memobj;
+ parent = (cl_mem)(buf->parent);
+ } else if (memobj->type == CL_MEM_IMAGE_TYPE) {
+ parent = memobj;
+ } else if (memobj->type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
+ struct _cl_mem_buffer1d_image *image_buffer = (struct _cl_mem_buffer1d_image *)memobj;
+ parent = image_buffer->descbuffer;
+ } else
+ parent = NULL;
+ src_ptr = &parent;
+ src_size = sizeof(cl_mem);
+ break;
+ }
+ case CL_MEM_OFFSET: {
+ offset = 0;
+ if (memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+ struct _cl_mem_buffer *buf = (struct _cl_mem_buffer *)memobj;
+ offset = buf->sub_offset;
+ }
+ src_ptr = &offset;
+ src_size = sizeof(size_t);
+ break;
+ }
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
+
+cl_int
+clGetImageInfo(cl_mem memobj,
+ cl_image_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ struct _cl_mem_image *image;
+ size_t height, depth, array_sz;
+ cl_uint value;
+
+ if (!CL_OBJECT_IS_MEM(memobj)) {
+ return CL_INVALID_MEM_OBJECT;
+ }
+ image = cl_mem_image(memobj);
+
+ switch (param_name) {
+ case CL_IMAGE_FORMAT:
+ src_ptr = &image->fmt;
+ src_size = sizeof(cl_image_format);
+ break;
+ case CL_IMAGE_ELEMENT_SIZE:
+ src_ptr = &image->bpp;
+ src_size = sizeof(size_t);
+ break;
+ case CL_IMAGE_ROW_PITCH:
+ src_ptr = &image->row_pitch;
+ src_size = sizeof(size_t);
+ break;
+ case CL_IMAGE_SLICE_PITCH:
+ src_ptr = &image->slice_pitch;
+ src_size = sizeof(size_t);
+ break;
+ case CL_IMAGE_WIDTH:
+ if (memobj->type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
+ struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image *)image;
+ src_ptr = &buffer1d_image->size;
+ } else {
+ src_ptr = &image->w;
+ }
+ src_size = sizeof(size_t);
+ break;
+ case CL_IMAGE_HEIGHT: {
+ height = 0;
+ if (memobj->type != CL_MEM_BUFFER1D_IMAGE_TYPE) {
+ height = IS_1D_IMAGE(image) ? 0 : image->h;
+ }
+ src_ptr = &height;
+ src_size = sizeof(size_t);
+ break;
+ }
+ case CL_IMAGE_DEPTH: {
+ depth = 0;
+ depth = IS_3D_IMAGE(image) ? image->depth : 0;
+ src_ptr = &depth;
+ src_size = sizeof(size_t);
+ break;
+ }
+ case CL_IMAGE_ARRAY_SIZE: {
+ array_sz = 0;
+ array_sz = IS_IMAGE_ARRAY(image) ? image->depth : 0;
+ src_ptr = &array_sz;
+ src_size = sizeof(size_t);
+ break;
+ }
+ case CL_IMAGE_BUFFER:
+ src_ptr = &image->buffer_1d;
+ src_size = sizeof(cl_mem);
+ break;
+ case CL_IMAGE_NUM_MIP_LEVELS:
+ case CL_IMAGE_NUM_SAMPLES: {
+ value = 0;
+ src_ptr = &value;
+ src_size = sizeof(cl_uint);
+ break;
+ }
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
+
+void *
+clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ void *ptr = NULL;
+ void *mem_ptr = NULL;
+ cl_event e = NULL;
+ cl_int e_status;
+ enqueue_data *data = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (!size || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((map_flags & CL_MAP_READ &&
+ buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+ (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+ buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_MAP_BUFFER, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_map) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* A blocking call acts as a flush synchronization point. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueMapBuffer;
+ data->mem_obj = buffer;
+ data->offset = offset;
+ data->size = size;
+ data->ptr = NULL;
+ data->unsync_map = 0;
+ if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
+ data->write_map = 1;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+
+ ptr = data->ptr;
+ assert(ptr);
+ err = cl_mem_record_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
+ assert(err == CL_SUCCESS);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ if (errcode_ret)
+ *errcode_ret = err;
+
+ return mem_ptr;
+}
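+
A blocking map flushes the queue first, and the returned pointer is recorded via cl_mem_record_map_mem so the later unmap can find it. A caller sketch (editorial illustration; "q" and "buf" are assumed):

cl_int err;
float *p = clEnqueueMapBuffer(q, buf, CL_TRUE, CL_MAP_WRITE,
                              0, 64 * sizeof(float),
                              0, NULL, NULL, &err);
if (err == CL_SUCCESS) {
  p[0] = 1.0f; /* direct host access while mapped */
  clEnqueueUnmapMemObject(q, buf, p, 0, NULL, NULL);
}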
+
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue command_queue,
+ cl_mem memobj,
+ void *mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ cl_int e_status;
+ enqueue_data *data = NULL;
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_MEM(memobj)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != memobj->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_UNMAP_MEM_OBJECT, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueUnmapMemObject;
+ data->mem_obj = memobj;
+ data->ptr = mapped_ptr;
+
+ if (e_status == CL_COMPLETE) { // No need to wait
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else { // May need to wait some event to complete.
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueReadBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data = NULL;
+ cl_int e_status;
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (!ptr || !size || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_READ_BUFFER, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_read) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* A blocking call acts as a flush synchronization point. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueReadBuffer;
+ data->mem_obj = buffer;
+ data->ptr = ptr;
+ data->offset = offset;
+ data->size = size;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueWriteBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data = NULL;
+ cl_int e_status;
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (!ptr || !size || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_WRITE_BUFFER, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_write) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* A blocking call acts as a flush synchronization point. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueWriteBuffer;
+ data->mem_obj = buffer;
+ data->const_ptr = ptr;
+ data->offset = offset;
+ data->size = size;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueReadBufferRect(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ const size_t *buffer_origin,
+ const size_t *host_origin,
+ const size_t *region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ size_t total_size = 0;
+ enqueue_data *data = NULL;
+ cl_int e_status;
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (buffer_row_pitch == 0)
+ buffer_row_pitch = region[0];
+ if (buffer_slice_pitch == 0)
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+ if (host_row_pitch == 0)
+ host_row_pitch = region[0];
+ if (host_slice_pitch == 0)
+ host_slice_pitch = region[1] * host_row_pitch;
+
+ if (buffer_row_pitch < region[0] ||
+ host_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0) ||
+ (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ total_size = (buffer_origin[2] + region[2] - 1) * buffer_slice_pitch +
+ (buffer_origin[1] + region[1] - 1) * buffer_row_pitch + buffer_origin[0] + region[0];
+ if (total_size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_READ_BUFFER_RECT, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_read) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* Blocking call API is a sync point of flush. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueReadBufferRect;
+ data->mem_obj = buffer;
+ data->ptr = ptr;
+ data->origin[0] = buffer_origin[0];
+ data->origin[1] = buffer_origin[1];
+ data->origin[2] = buffer_origin[2];
+ data->host_origin[0] = host_origin[0];
+ data->host_origin[1] = host_origin[1];
+ data->host_origin[2] = host_origin[2];
+ data->region[0] = region[0];
+ data->region[1] = region[1];
+ data->region[2] = region[2];
+ data->row_pitch = buffer_row_pitch;
+ data->slice_pitch = buffer_slice_pitch;
+ data->host_row_pitch = host_row_pitch;
+ data->host_slice_pitch = host_slice_pitch;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ const size_t *buffer_origin,
+ const size_t *host_origin,
+ const size_t *region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ size_t total_size = 0;
+ enqueue_data *data = NULL;
+ cl_int e_status;
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (buffer_row_pitch == 0)
+ buffer_row_pitch = region[0];
+ if (buffer_slice_pitch == 0)
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+ if (host_row_pitch == 0)
+ host_row_pitch = region[0];
+ if (host_slice_pitch == 0)
+ host_slice_pitch = region[1] * host_row_pitch;
+
+ if (buffer_row_pitch < region[0] ||
+ host_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0) ||
+ (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ total_size = (buffer_origin[2] + region[2] - 1) * buffer_slice_pitch +
+ (buffer_origin[1] + region[1] - 1) * buffer_row_pitch +
+ buffer_origin[0] + region[0];
+
+ if (total_size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_WRITE_BUFFER_RECT, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_write) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* Blocking call API is a sync point of flush. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueWriteBufferRect;
+ data->mem_obj = buffer;
+ data->const_ptr = ptr;
+ data->origin[0] = buffer_origin[0];
+ data->origin[1] = buffer_origin[1];
+ data->origin[2] = buffer_origin[2];
+ data->host_origin[0] = host_origin[0];
+ data->host_origin[1] = host_origin[1];
+ data->host_origin[2] = host_origin[2];
+ data->region[0] = region[0];
+ data->region[1] = region[1];
+ data->region[2] = region[2];
+ data->row_pitch = buffer_row_pitch;
+ data->slice_pitch = buffer_slice_pitch;
+ data->host_row_pitch = host_row_pitch;
+ data->host_slice_pitch = host_slice_pitch;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueCopyBuffer(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ size_t src_offset,
+ size_t dst_offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ cl_event e = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_MEM(src_buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!CL_OBJECT_IS_MEM(dst_buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != src_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+ if (command_queue->ctx != dst_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (src_offset + cb > src_buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ if (dst_offset + cb > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+    /* Check overlap: the regions overlap if either start offset falls inside
+       the other region. */
+    if (src_buffer == dst_buffer && ((src_offset <= dst_offset && dst_offset <= src_offset + cb - 1) ||
+        (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1))) {
+      err = CL_MEM_COPY_OVERLAP;
+      break;
+    }
+
+    /* Check sub-buffer overlap, taking the sub-buffer offsets into account. */
+    if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE) {
+      struct _cl_mem_buffer *src_b = (struct _cl_mem_buffer *)src_buffer;
+      struct _cl_mem_buffer *dst_b = (struct _cl_mem_buffer *)dst_buffer;
+      size_t src_sub_offset = src_b->sub_offset;
+      size_t dst_sub_offset = dst_b->sub_offset;
+      if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset &&
+           dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1) ||
+          (dst_offset + dst_sub_offset <= src_offset + src_sub_offset &&
+           src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
+        err = CL_MEM_COPY_OVERLAP;
+        break;
+      }
+    }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_COPY_BUFFER, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_mem_copy(command_queue, e, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+/* The following code checking overlap is from Appendix of openCL spec 1.1 */
+static cl_bool
+check_copy_overlap(const size_t src_offset[3],
+ const size_t dst_offset[3],
+ const size_t region[3],
+ size_t row_pitch, size_t slice_pitch)
+{
+ const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
+ const size_t src_max[] = {src_offset[0] + region[0],
+ src_offset[1] + region[1],
+ src_offset[2] + region[2]};
+ const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
+ const size_t dst_max[] = {dst_offset[0] + region[0],
+ dst_offset[1] + region[1],
+ dst_offset[2] + region[2]};
+ // Check for overlap
+ cl_bool overlap = CL_TRUE;
+ unsigned i;
+ size_t dst_start = dst_offset[2] * slice_pitch +
+ dst_offset[1] * row_pitch + dst_offset[0];
+ size_t dst_end = dst_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+ size_t src_start = src_offset[2] * slice_pitch +
+ src_offset[1] * row_pitch + src_offset[0];
+ size_t src_end = src_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+
+ for (i = 0; i != 3; ++i) {
+ overlap = overlap && (src_min[i] < dst_max[i]) && (src_max[i] > dst_min[i]);
+ }
+
+ if (!overlap) {
+ size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ? src_offset[0] + region[0] - row_pitch : 0;
+ size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ? dst_offset[0] + region[0] - row_pitch : 0;
+ if ((delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+ (delta_dst_x > 0 && delta_dst_x > src_offset[0])) {
+ if ((src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end))
+ overlap = CL_TRUE;
+ }
+ if (region[2] > 1) {
+ size_t src_height = slice_pitch / row_pitch;
+ size_t dst_height = slice_pitch / row_pitch;
+ size_t delta_src_y = (src_offset[1] + region[1] > src_height) ? src_offset[1] + region[1] - src_height : 0;
+ size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ? dst_offset[1] + region[1] - dst_height : 0;
+ if ((delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+ (delta_dst_y > 0 && delta_dst_y > src_offset[1])) {
+ if ((src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end))
+ overlap = CL_TRUE;
+ }
+ }
+ }
+ return overlap;
+}
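+/* A small worked example of the check above (illustrative figures, not from
+ * the spec). With row_pitch = 16, slice_pitch = 64, region = {8, 2, 1},
+ * src_offset = {0, 0, 0} and dst_offset = {4, 1, 0}, the per-axis test finds
+ * the boxes intersecting on every axis (0 < 12 and 8 > 4 on x; 0 < 3 and
+ * 2 > 1 on y; 0 < 1 and 1 > 0 on z), so the function returns CL_TRUE without
+ * ever needing the linearized start/end fallback. */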
+
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ const size_t *src_origin,
+ const size_t *dst_origin,
+ const size_t *region,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ cl_event e = NULL;
+ size_t total_size = 0;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_MEM(src_buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!CL_OBJECT_IS_MEM(dst_buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if ((command_queue->ctx != src_buffer->ctx) ||
+ (command_queue->ctx != dst_buffer->ctx)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (src_row_pitch == 0)
+ src_row_pitch = region[0];
+ if (src_slice_pitch == 0)
+ src_slice_pitch = region[1] * src_row_pitch;
+
+ if (dst_row_pitch == 0)
+ dst_row_pitch = region[0];
+ if (dst_slice_pitch == 0)
+ dst_slice_pitch = region[1] * dst_row_pitch;
+
+ if (src_row_pitch < region[0] ||
+ dst_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0) ||
+ (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ total_size = (src_origin[2] + region[2] - 1) * src_slice_pitch +
+ (src_origin[1] + region[1] - 1) * src_row_pitch + src_origin[0] + region[0];
+ if (total_size > src_buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ total_size = (dst_origin[2] + region[2] - 1) * dst_slice_pitch +
+ (dst_origin[1] + region[1] - 1) * dst_row_pitch + dst_origin[0] + region[0];
+ if (total_size > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (src_buffer == dst_buffer &&
+ (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (src_buffer == dst_buffer &&
+ check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+ err = CL_MEM_COPY_OVERLAP;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_COPY_BUFFER_RECT, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_mem_copy_buffer_rect(command_queue, e, src_buffer, dst_buffer, src_origin, dst_origin, region,
+ src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ } else if (e_status == CL_COMPLETE) {
+ err = cl_event_exec(e, CL_SUBMITTED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ const void *pattern,
+ size_t pattern_size,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+ int i = 0;
+ cl_event e = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (pattern == NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
+ if (valid_sz[i] == pattern_size)
+ break;
+ }
+ if (i == sizeof(valid_sz) / sizeof(size_t)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (offset % pattern_size || size % pattern_size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_FILL_BUFFER, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_mem_fill(command_queue, e, pattern, pattern_size, buffer, offset, size);
+ if (err) {
+ break;
+ }
+
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
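+/* A minimal usage sketch for the fill path above (illustrative only,
+ * compiled out; `queue` and `buf` are assumed to exist). Per the checks in
+ * the function, pattern_size must be one of {1,2,4,8,16,32,64,128} and both
+ * offset and size must be multiples of pattern_size. */
+#if 0
+static cl_int zero_fill_example(cl_command_queue queue, cl_mem buf, size_t nbytes)
+{
+  const cl_uint zero = 0; /* a 4-byte pattern */
+  /* offset 0 and nbytes must both be multiples of sizeof(zero). */
+  return clEnqueueFillBuffer(queue, buf, &zero, sizeof(zero), 0, nbytes,
+                             0, NULL, NULL);
+}
+#endif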
+
+cl_int
+clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+ cl_uint num_mem_objects,
+ const cl_mem *mem_objects,
+ cl_mem_migration_flags flags,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+  /* So far we support only one device and no sub-devices, so every command
+     queue belongs to the same context and there is nothing to migrate yet. */
+ cl_int err = CL_SUCCESS;
+ cl_event e = NULL;
+ cl_int e_status;
+ cl_uint i = 0;
+
+ do {
+    if (!(flags & CL_MIGRATE_MEM_OBJECT_HOST)) {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+ }
+
+ if (num_mem_objects == 0 || mem_objects == NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ for (i = 0; i < num_mem_objects; i++) {
+ if (!CL_OBJECT_IS_MEM(mem_objects[i])) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (mem_objects[i]->ctx != command_queue->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+ }
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_MIGRATE_MEM_OBJECTS, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+    /* Nothing to do for now, just enqueue an event. */
+    e->exec_data.type = EnqueueMigrateMemObj;
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+/************************************ Images *********************************************/
+static cl_int
+check_image_region(struct _cl_mem_image *image, const size_t *pregion, size_t *region)
+{
+ if (pregion == NULL) {
+ return CL_INVALID_VALUE;
+ }
+
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ region[0] = pregion[0];
+ region[1] = 1;
+ region[2] = pregion[1];
+ } else {
+ region[0] = pregion[0];
+ region[1] = pregion[1];
+ region[2] = pregion[2];
+ }
+
+ if ((region[0] == 0) || (region[1] == 0) || (region[2] == 0)) {
+ return CL_INVALID_VALUE;
+ }
+
+ return CL_SUCCESS;
+}
+
+static cl_int
+check_image_origin(struct _cl_mem_image *image, const size_t *porigin, size_t *origin)
+{
+ if (porigin == NULL) {
+ return CL_INVALID_VALUE;
+ }
+
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ origin[0] = porigin[0];
+ origin[1] = 0;
+ origin[2] = porigin[1];
+ } else {
+ origin[0] = porigin[0];
+ origin[1] = porigin[1];
+ origin[2] = porigin[2];
+ }
+
+ return CL_SUCCESS;
+}
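+/* For a CL_MEM_OBJECT_IMAGE1D_ARRAY the public API packs (x, array_index)
+ * into two elements, and the two helpers above unpack them into the internal
+ * 3D form. For example, porigin = {16, 3} with pregion = {32, 5} becomes
+ * origin = {16, 0, 3} and region = {32, 1, 5}: the array index travels in
+ * the z slot and the y coordinate collapses to 0 (origin) and 1 (region). */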
+
+void *
+clEnqueueMapImage(cl_command_queue command_queue,
+ cl_mem mem,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ const size_t *porigin,
+ const size_t *pregion,
+ size_t *image_row_pitch,
+ size_t *image_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ void *ptr = NULL;
+ void *mem_ptr = NULL;
+ size_t offset = 0;
+ struct _cl_mem_image *image = NULL;
+ cl_int e_status;
+ enqueue_data *data = NULL;
+ size_t region[3];
+ size_t origin[3];
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_IMAGE(mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ image = cl_mem_image(mem);
+
+ err = check_image_region(image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(image, porigin, origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (origin[0] + region[0] > image->w ||
+ origin[1] + region[1] > image->h ||
+ origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((map_flags & CL_MAP_READ &&
+ mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+ (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+ mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_MAP_IMAGE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_map) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* Blocking call API is a sync point of flush. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueMapImage;
+ data->mem_obj = mem;
+ data->origin[0] = origin[0];
+ data->origin[1] = origin[1];
+ data->origin[2] = origin[2];
+ data->region[0] = region[0];
+ data->region[1] = region[1];
+ data->region[2] = region[2];
+ data->ptr = ptr;
+ data->unsync_map = 1;
+ if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
+ data->write_map = 1;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+
+ ptr = data->ptr;
+ assert(ptr);
+
+ /* Store and write back map info. */
+ if (mem->flags & CL_MEM_USE_HOST_PTR) {
+ if (image_slice_pitch)
+ *image_slice_pitch = image->host_slice_pitch;
+ *image_row_pitch = image->host_row_pitch;
+
+ offset = image->bpp * origin[0] + image->host_row_pitch * origin[1] +
+ image->host_slice_pitch * origin[2];
+ } else {
+ if (image_slice_pitch)
+ *image_slice_pitch = image->slice_pitch;
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ *image_row_pitch = image->slice_pitch;
+ else
+ *image_row_pitch = image->row_pitch;
+
+ offset = image->bpp * origin[0] + image->row_pitch * origin[1] + image->slice_pitch * origin[2];
+ }
+
+ err = cl_mem_record_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+    assert(err == CL_SUCCESS); // Keep it simple: do not try to unmap on error.
+ } while (0);
+
+ if (err != CL_SUCCESS) {
+ if (e) {
+ cl_event_delete(e);
+ e = NULL;
+ }
+
+ assert(ptr == NULL);
+ }
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ if (errcode_ret)
+ *errcode_ret = err;
+
+ return mem_ptr;
+}
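+/* A minimal sketch of the usual map/unmap pairing for the function above
+ * (illustrative only, compiled out; `queue` and `img` are assumed to exist
+ * and `img` is assumed to be a 2D image, so a NULL slice pitch is valid). */
+#if 0
+static cl_int map_row_example(cl_command_queue queue, cl_mem img, size_t width)
+{
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {width, 1, 1};
+  size_t row_pitch = 0;
+  cl_int err = CL_SUCCESS;
+  void *p = clEnqueueMapImage(queue, img, CL_TRUE, CL_MAP_READ, origin, region,
+                              &row_pitch, NULL, 0, NULL, NULL, &err);
+  if (err != CL_SUCCESS)
+    return err;
+  /* ... read the mapped row through p, honoring row_pitch ... */
+  return clEnqueueUnmapMemObject(queue, img, p, 0, NULL, NULL);
+}
+#endif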
+
+cl_int
+clEnqueueReadImage(cl_command_queue command_queue,
+ cl_mem mem,
+ cl_bool blocking_read,
+ const size_t *porigin,
+ const size_t *pregion,
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ struct _cl_mem_image *image = NULL;
+ enqueue_data *data = NULL;
+ cl_int e_status;
+ size_t region[3];
+ size_t origin[3];
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_IMAGE(mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ image = cl_mem_image(mem);
+
+ err = check_image_region(image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(image, porigin, origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (origin[0] + region[0] > image->w ||
+ origin[1] + region[1] > image->h ||
+ origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (!row_pitch) {
+ row_pitch = image->bpp * region[0];
+ } else if (row_pitch < image->bpp * region[0]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (image->slice_pitch) {
+ if (!slice_pitch) {
+ slice_pitch = row_pitch * region[1];
+ } else if (slice_pitch < row_pitch * region[1]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ } else if (slice_pitch) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (!ptr) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_READ_IMAGE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_read) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* Blocking call API is a sync point of flush. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueReadImage;
+ data->mem_obj = mem;
+ data->ptr = ptr;
+ data->origin[0] = origin[0];
+ data->origin[1] = origin[1];
+ data->origin[2] = origin[2];
+ data->region[0] = region[0];
+ data->region[1] = region[1];
+ data->region[2] = region[2];
+ data->row_pitch = row_pitch;
+ data->slice_pitch = slice_pitch;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueWriteImage(cl_command_queue command_queue,
+ cl_mem mem,
+ cl_bool blocking_write,
+ const size_t *porigin,
+ const size_t *pregion,
+ size_t row_pitch,
+ size_t slice_pitch,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ struct _cl_mem_image *image = NULL;
+ enqueue_data *data = NULL;
+ cl_int e_status;
+ size_t region[3];
+ size_t origin[3];
+ cl_event e = NULL;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_IMAGE(mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ image = cl_mem_image(mem);
+
+ err = check_image_region(image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(image, porigin, origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (origin[0] + region[0] > image->w ||
+ origin[1] + region[1] > image->h ||
+ origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (!row_pitch) {
+ row_pitch = image->bpp * region[0];
+ } else if (row_pitch < image->bpp * region[0]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (image->slice_pitch) {
+ if (!slice_pitch) {
+ slice_pitch = row_pitch * region[1];
+ } else if (slice_pitch < row_pitch * region[1]) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+ } else if (slice_pitch) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (!ptr) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_WRITE_IMAGE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (blocking_write) {
+ err = cl_event_wait_for_event_ready(e);
+ if (err != CL_SUCCESS)
+ break;
+
+ /* Blocking call API is a sync point of flush. */
+ err = cl_command_queue_wait_flush(command_queue);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+
+ e_status = cl_event_is_ready(e);
+ if (e_status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ data = &e->exec_data;
+ data->type = EnqueueWriteImage;
+ data->mem_obj = mem;
+ data->const_ptr = ptr;
+ data->origin[0] = origin[0];
+ data->origin[1] = origin[1];
+ data->origin[2] = origin[2];
+ data->region[0] = region[0];
+ data->region[1] = region[1];
+ data->region[2] = region[2];
+ data->row_pitch = row_pitch;
+ data->slice_pitch = slice_pitch;
+
+ if (e_status == CL_COMPLETE) {
+ // Sync mode, no need to queue event.
+ err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ } else {
+ err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ cl_command_queue_enqueue_event(command_queue, e);
+ }
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueCopyImage(cl_command_queue command_queue,
+ cl_mem src_mem,
+ cl_mem dst_mem,
+ const size_t *psrc_origin,
+ const size_t *pdst_origin,
+ const size_t *pregion,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ cl_bool overlap = CL_TRUE;
+ cl_int i = 0;
+ cl_event e = NULL;
+ struct _cl_mem_image *src_image = NULL;
+ struct _cl_mem_image *dst_image = NULL;
+ size_t region[3];
+ size_t src_origin[3];
+ size_t dst_origin[3];
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_IMAGE(src_mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!CL_OBJECT_IS_IMAGE(dst_mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ src_image = cl_mem_image(src_mem);
+ dst_image = cl_mem_image(dst_mem);
+
+ err = check_image_region(src_image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(src_image, psrc_origin, src_origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ err = check_image_origin(dst_image, pdst_origin, dst_origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != src_mem->ctx ||
+ command_queue->ctx != dst_mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
+ src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
+ err = CL_IMAGE_FORMAT_MISMATCH;
+ break;
+ }
+
+ if (src_origin[0] + region[0] > src_image->w ||
+ src_origin[1] + region[1] > src_image->h ||
+ src_origin[2] + region[2] > src_image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (dst_origin[0] + region[0] > dst_image->w ||
+ dst_origin[1] + region[1] > dst_image->h ||
+ dst_origin[2] + region[2] > dst_image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
+ (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (src_image == dst_image) {
+ for (i = 0; i < 3; i++) {
+ overlap = overlap && (src_origin[i] < dst_origin[i] + region[i]) &&
+ (dst_origin[i] < src_origin[i] + region[i]);
+ }
+ if (overlap == CL_TRUE) {
+ err = CL_MEM_COPY_OVERLAP;
+ break;
+ }
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_COPY_IMAGE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_mem_kernel_copy_image(command_queue, e, src_image, dst_image,
+ src_origin, dst_origin, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+ cl_mem src_mem,
+ cl_mem dst_buffer,
+ const size_t *psrc_origin,
+ const size_t *pregion,
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ struct _cl_mem_image *src_image = NULL;
+ size_t region[3];
+ size_t src_origin[3];
+ cl_event e = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_IMAGE(src_mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!CL_OBJECT_IS_BUFFER(dst_buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ src_image = cl_mem_image(src_mem);
+
+ err = check_image_region(src_image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(src_image, psrc_origin, src_origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != src_mem->ctx ||
+ command_queue->ctx != dst_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (dst_offset + region[0] * region[1] * region[2] * src_image->bpp > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (src_origin[0] + region[0] > src_image->w ||
+ src_origin[1] + region[1] > src_image->h ||
+ src_origin[2] + region[2] > src_image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_COPY_IMAGE_TO_BUFFER, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_mem_copy_image_to_buffer(command_queue, e, src_image, dst_buffer,
+ src_origin, dst_offset, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_mem,
+ size_t src_offset,
+ const size_t *pdst_origin,
+ const size_t *pregion,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ struct _cl_mem_image *dst_image = NULL;
+ size_t region[3];
+ size_t dst_origin[3];
+ cl_event e = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_BUFFER(src_buffer)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+ if (!CL_OBJECT_IS_IMAGE(dst_mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ dst_image = cl_mem_image(dst_mem);
+
+ err = check_image_region(dst_image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(dst_image, pdst_origin, dst_origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != src_buffer->ctx ||
+ command_queue->ctx != dst_mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (src_offset + region[0] * region[1] * region[2] * dst_image->bpp > src_buffer->size) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (dst_origin[0] + region[0] > dst_image->w ||
+ dst_origin[1] + region[1] > dst_image->h ||
+ dst_origin[2] + region[2] > dst_image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_COPY_BUFFER_TO_IMAGE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_mem_copy_buffer_to_image(command_queue, e, src_buffer, dst_image,
+ src_offset, dst_origin, region);
+
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clEnqueueFillImage(cl_command_queue command_queue,
+ cl_mem mem,
+ const void *fill_color,
+ const size_t *porigin,
+ const size_t *pregion,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ size_t region[3];
+ size_t origin[3];
+ cl_event e = NULL;
+ struct _cl_mem_image *image = NULL;
+ cl_int e_status;
+
+ do {
+ if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+ err = CL_INVALID_COMMAND_QUEUE;
+ break;
+ }
+
+ if (!CL_OBJECT_IS_IMAGE(mem)) {
+ err = CL_INVALID_MEM_OBJECT;
+ break;
+ }
+
+ image = cl_mem_image(mem);
+
+ err = check_image_region(image, pregion, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = check_image_origin(image, porigin, origin);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (fill_color == NULL) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (origin[0] + region[0] > image->w ||
+ origin[1] + region[1] > image->h ||
+ origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 || origin[1] != 0 ||
+ region[2] != 1 || region[1] != 1)) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+ event, command_queue->ctx);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+ event_wait_list, CL_COMMAND_FILL_IMAGE, &err);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ err = cl_image_fill(command_queue, e, fill_color, image, origin, region);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+    /* If nothing depends on this event, flush the ndrange now; otherwise add it
+       to the queue list. The finish/Complete status is always handled from the
+       queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // An error happened, cancel.
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ break;
+ }
+
+ err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+
+ cl_command_queue_enqueue_event(command_queue, e);
+ } while (0);
+
+ if (err == CL_SUCCESS && event) {
+ *event = e;
+ } else {
+ cl_event_delete(e);
+ }
+
+ return err;
+}
+
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+ if (!CL_OBJECT_IS_MEM(memobj)) {
+ return CL_INVALID_MEM_OBJECT;
+ }
+
+ cl_mem_add_ref(memobj);
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+ if (!CL_OBJECT_IS_MEM(memobj)) {
+ return CL_INVALID_MEM_OBJECT;
+ }
+
+ cl_mem_delete(memobj);
+ return CL_SUCCESS;
+}
diff --git a/src/cl_api_platform_id.c b/src/cl_api_platform_id.c
new file mode 100644
index 0000000..10d8894
--- /dev/null
+++ b/src/cl_api_platform_id.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_platform_id.h"
+#include "CL/cl_ext.h"
+
+cl_int
+clGetPlatformInfo(cl_platform_id platform,
+ cl_platform_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+
+ if (!CL_OBJECT_IS_PLATFORM(platform)) {
+ return CL_INVALID_PLATFORM;
+ }
+
+ /* Only one platform now. */
+ if (platform != cl_get_platform_default()) {
+ return CL_INVALID_PLATFORM;
+ }
+
+ if (param_name == CL_PLATFORM_PROFILE) {
+ src_ptr = platform->profile;
+ src_size = platform->profile_sz;
+ } else if (param_name == CL_PLATFORM_VERSION) {
+ src_ptr = platform->version;
+ src_size = platform->version_sz;
+ } else if (param_name == CL_PLATFORM_NAME) {
+ src_ptr = platform->name;
+ src_size = platform->name_sz;
+ } else if (param_name == CL_PLATFORM_VENDOR) {
+ src_ptr = platform->vendor;
+ src_size = platform->vendor_sz;
+ } else if (param_name == CL_PLATFORM_EXTENSIONS) {
+ src_ptr = platform->extensions;
+ src_size = platform->extensions_sz;
+ } else if (param_name == CL_PLATFORM_ICD_SUFFIX_KHR) {
+ src_ptr = platform->icd_suffix_khr;
+ src_size = platform->icd_suffix_khr_sz;
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
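+/* A minimal sketch of the conventional two-call query pattern against the
+ * helper above: first ask for the size, then for the value (illustrative
+ * only, compiled out; `plat` is assumed valid and <stdlib.h> is assumed for
+ * malloc/free). */
+#if 0
+static char *query_platform_name(cl_platform_id plat)
+{
+  size_t sz = 0;
+  char *name = NULL;
+  if (clGetPlatformInfo(plat, CL_PLATFORM_NAME, 0, NULL, &sz) != CL_SUCCESS)
+    return NULL;
+  name = malloc(sz);
+  if (name && clGetPlatformInfo(plat, CL_PLATFORM_NAME, sz, name, NULL) != CL_SUCCESS) {
+    free(name);
+    name = NULL;
+  }
+  return name; /* caller frees */
+}
+#endif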
diff --git a/src/cl_api_program.c b/src/cl_api_program.c
new file mode 100644
index 0000000..d68f29f
--- /dev/null
+++ b/src/cl_api_program.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_device_id.h"
+#include <string.h>
+
+cl_int
+clGetProgramInfo(cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ const char *ret_str = "";
+ cl_int ref;
+ cl_uint num_dev, kernels_num;
+
+ if (!CL_OBJECT_IS_PROGRAM(program)) {
+ return CL_INVALID_PROGRAM;
+ }
+
+ if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
+ ref = CL_OBJECT_GET_REF(program);
+ src_ptr = &ref;
+ src_size = sizeof(cl_int);
+ } else if (param_name == CL_PROGRAM_CONTEXT) {
+ src_ptr = &program->ctx;
+ src_size = sizeof(cl_context);
+ } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
+ num_dev = program->ctx->device_num; // Just 1 dev now.
+ src_ptr = &num_dev;
+ src_size = sizeof(cl_uint);
+ } else if (param_name == CL_PROGRAM_DEVICES) {
+ src_ptr = program->ctx->devices;
+ src_size = program->ctx->device_num * sizeof(cl_device_id);
+ } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
+ kernels_num = program->ker_n;
+ src_ptr = &kernels_num;
+ src_size = sizeof(cl_uint);
+ } else if (param_name == CL_PROGRAM_SOURCE) {
+ if (!program->source) {
+ src_ptr = ret_str;
+ src_size = 1;
+ } else {
+ src_ptr = program->source;
+ src_size = strlen(program->source) + 1;
+ }
+ } else if (param_name == CL_PROGRAM_KERNEL_NAMES) {
+ // TODO: need to refine this.
+ cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
+ return CL_SUCCESS;
+ } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
+ if (program->binary == NULL) {
+ if (program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+ } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+ } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+ } else {
+ return CL_INVALID_BINARY;
+ }
+ }
+
+ if (program->binary == NULL || program->binary_sz == 0) {
+ return CL_OUT_OF_RESOURCES;
+ }
+ src_ptr = &program->binary_sz;
+ src_size = sizeof(size_t);
+ } else if (param_name == CL_PROGRAM_BINARIES) {
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(void *);
+ if (!param_value)
+ return CL_SUCCESS;
+
+ /* param_value points to an array of n
+ pointers allocated by the caller */
+ if (program->binary == NULL) {
+ if (program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+ } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+ } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+ } else {
+ return CL_INVALID_BINARY;
+ }
+ }
+
+ if (program->binary == NULL || program->binary_sz == 0) {
+ return CL_OUT_OF_RESOURCES;
+ }
+
+ memcpy(*((void **)param_value), program->binary, program->binary_sz);
+ return CL_SUCCESS;
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
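+/* A minimal sketch of retrieving a program binary through the two queries
+ * implemented above (illustrative only, compiled out). CL_PROGRAM_BINARIES
+ * expects param_value to point to an array of caller-allocated pointers,
+ * one per device -- a single device here. `prog` is assumed built and
+ * <stdlib.h> is assumed for malloc/free. */
+#if 0
+static unsigned char *dump_binary(cl_program prog, size_t *sz_out)
+{
+  size_t sz = 0;
+  unsigned char *bin = NULL;
+  if (clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL) != CL_SUCCESS)
+    return NULL;
+  bin = malloc(sz);
+  if (bin && clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(bin), &bin, NULL) != CL_SUCCESS) {
+    free(bin);
+    return NULL;
+  }
+  *sz_out = sz;
+  return bin; /* caller frees */
+}
+#endif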
+
+cl_int
+clGetProgramBuildInfo(cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ const char *ret_str = "";
+ size_t global_size;
+
+ if (!CL_OBJECT_IS_PROGRAM(program)) {
+ return CL_INVALID_PROGRAM;
+ }
+
+ cl_int err = cl_devices_list_include_check(program->ctx->device_num,
+ program->ctx->devices, 1, &device);
+ if (err != CL_SUCCESS)
+ return err;
+
+ if (param_name == CL_PROGRAM_BUILD_STATUS) {
+ src_ptr = &program->build_status;
+ src_size = sizeof(cl_build_status);
+ } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
+ if (program->is_built && program->build_opts) {
+ ret_str = program->build_opts;
+ }
+ src_ptr = ret_str;
+ src_size = strlen(ret_str) + 1;
+ } else if (param_name == CL_PROGRAM_BUILD_LOG) {
+ src_ptr = program->build_log;
+ src_size = program->build_log_sz + 1;
+ } else if (param_name == CL_PROGRAM_BINARY_TYPE) {
+ src_ptr = &program->binary_type;
+ src_size = sizeof(cl_uint);
+ } else if (param_name == CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE) {
+ global_size = 0;
+ if (program->is_built)
+ global_size = cl_program_get_global_variable_size(program);
+ src_ptr = &global_size;
+ src_size = sizeof(global_size);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
diff --git a/src/cl_api_sampler.c b/src/cl_api_sampler.c
new file mode 100644
index 0000000..d8dba29
--- /dev/null
+++ b/src/cl_api_sampler.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_sampler.h"
+#include "cl_context.h"
+#include "cl_device_id.h"
+
+cl_sampler
+clCreateSampler(cl_context context,
+ cl_bool normalized,
+ cl_addressing_mode addressing,
+ cl_filter_mode filter,
+ cl_int *errcode_ret)
+{
+ cl_sampler sampler = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_uint i;
+
+ do {
+ if (!CL_OBJECT_IS_CONTEXT(context)) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+
+ if (addressing < CL_ADDRESS_NONE || addressing > CL_ADDRESS_MIRRORED_REPEAT) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+ if (filter < CL_FILTER_NEAREST || filter > CL_FILTER_LINEAR) {
+ err = CL_INVALID_VALUE;
+ break;
+ }
+
+    /* Every device associated with the context must support images. */
+ for (i = 0; i < context->device_num; i++) {
+ if (context->devices[i]->image_support == CL_FALSE) {
+ err = CL_INVALID_OPERATION;
+ break;
+ }
+ }
+ if (err != CL_SUCCESS)
+ break;
+
+ sampler = cl_create_sampler(context, normalized, addressing, filter, &err);
+ } while (0);
+
+ if (errcode_ret)
+ *errcode_ret = err;
+ return sampler;
+}
+
+cl_int
+clGetSamplerInfo(cl_sampler sampler,
+ cl_sampler_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ cl_int ref;
+
+ if (!CL_OBJECT_IS_SAMPLER(sampler)) {
+ return CL_INVALID_SAMPLER;
+ }
+
+ if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
+ ref = CL_OBJECT_GET_REF(sampler);
+ src_ptr = &ref;
+ src_size = sizeof(cl_int);
+ } else if (param_name == CL_SAMPLER_CONTEXT) {
+ src_ptr = &sampler->ctx;
+ src_size = sizeof(cl_context);
+ } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
+ src_ptr = &sampler->normalized_coords;
+ src_size = sizeof(cl_bool);
+ } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
+ src_ptr = &sampler->address;
+ src_size = sizeof(cl_addressing_mode);
+ } else if (param_name == CL_SAMPLER_FILTER_MODE) {
+ src_ptr = &sampler->filter;
+ src_size = sizeof(cl_filter_mode);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
+}
+
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+ if (!CL_OBJECT_IS_SAMPLER(sampler)) {
+ return CL_INVALID_SAMPLER;
+ }
+
+ cl_sampler_add_ref(sampler);
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+ if (!CL_OBJECT_IS_SAMPLER(sampler)) {
+ return CL_INVALID_SAMPLER;
+ }
+
+ cl_sampler_delete(sampler);
+ return CL_SUCCESS;
+}
diff --git a/src/cl_base_object.c b/src/cl_base_object.c
new file mode 100644
index 0000000..5578bdc
--- /dev/null
+++ b/src/cl_base_object.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <stdio.h>
+#include "cl_base_object.h"
+
+static pthread_t invalid_thread_id = -1;
+
+LOCAL void
+cl_object_init_base(cl_base_object obj, cl_ulong magic)
+{
+ obj->magic = magic;
+ obj->ref = 1;
+ SET_ICD(obj->dispatch);
+ pthread_mutex_init(&obj->mutex, NULL);
+ pthread_cond_init(&obj->cond, NULL);
+ obj->owner = invalid_thread_id;
+ list_node_init(&obj->node);
+}
+
+LOCAL void
+cl_object_destroy_base(cl_base_object obj)
+{
+ int ref = CL_OBJECT_GET_REF(obj);
+ if (ref != 0) {
+    DEBUGP(DL_ERROR, "CL object %p, destroy called while its reference count is still %d",
+           obj, ref);
+ assert(0);
+ }
+
+ if (!CL_OBJECT_IS_VALID(obj)) {
+    DEBUGP(DL_ERROR,
+           "CL object %p, destroy called on an already dead object", obj);
+ assert(0);
+ }
+
+ if (obj->owner != invalid_thread_id) {
+    DEBUGP(DL_ERROR, "CL object %p, destroy called while it still has an owner %d",
+           obj, (int)obj->owner);
+ assert(0);
+ }
+
+ if (!list_node_out_of_list(&obj->node)) {
+    DEBUGP(DL_ERROR, "CL object %p, destroy called while it still belongs to some container %p",
+           obj, obj->node.p);
+ assert(0);
+ }
+
+ obj->magic = CL_OBJECT_INVALID_MAGIC;
+ pthread_mutex_destroy(&obj->mutex);
+ pthread_cond_destroy(&obj->cond);
+}
+
+LOCAL cl_int
+cl_object_take_ownership(cl_base_object obj, cl_int wait, cl_bool withlock)
+{
+ pthread_t self;
+
+ assert(CL_OBJECT_IS_VALID(obj));
+
+ self = pthread_self();
+
+ if (withlock == CL_FALSE)
+ pthread_mutex_lock(&obj->mutex);
+
+  if (pthread_equal(obj->owner, self)) { // We already own it.
+ if (withlock == CL_FALSE)
+ pthread_mutex_unlock(&obj->mutex);
+ return 1;
+ }
+
+ if (pthread_equal(obj->owner, invalid_thread_id)) {
+ obj->owner = self;
+
+ if (withlock == CL_FALSE)
+ pthread_mutex_unlock(&obj->mutex);
+ return 1;
+ }
+
+ if (wait == 0) {
+ if (withlock == CL_FALSE)
+ pthread_mutex_unlock(&obj->mutex);
+ return 0;
+ }
+
+ while (!pthread_equal(obj->owner, invalid_thread_id)) {
+ pthread_cond_wait(&obj->cond, &obj->mutex);
+ }
+
+ obj->owner = self;
+
+ if (withlock == CL_FALSE)
+ pthread_mutex_unlock(&obj->mutex);
+
+ return 1;
+}
+
+LOCAL void
+cl_object_release_ownership(cl_base_object obj, cl_bool withlock)
+{
+ assert(CL_OBJECT_IS_VALID(obj));
+
+ if (withlock == CL_FALSE)
+ pthread_mutex_lock(&obj->mutex);
+
+ assert(pthread_equal(pthread_self(), obj->owner));
+ obj->owner = invalid_thread_id;
+ pthread_cond_broadcast(&obj->cond);
+
+ if (withlock == CL_FALSE)
+ pthread_mutex_unlock(&obj->mutex);
+}
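+
+/* A minimal usage sketch (illustrative only): a long-running operation
+   takes ownership instead of holding the mutex, so other threads can
+   still take the lock for short queries while the owner works.
+
+     if (cl_object_take_ownership(obj, 1, CL_FALSE)) {
+       // ... long-running work on obj, without holding obj->mutex ...
+       cl_object_release_ownership(obj, CL_FALSE);
+     }
+*/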
+
+LOCAL void
+cl_object_wait_on_cond(cl_base_object obj)
+{
+ assert(CL_OBJECT_IS_VALID(obj));
+ pthread_cond_wait(&obj->cond, &obj->mutex);
+}
+
+LOCAL void
+cl_object_notify_cond(cl_base_object obj)
+{
+ assert(CL_OBJECT_IS_VALID(obj));
+ pthread_cond_broadcast(&obj->cond);
+}
diff --git a/src/cl_base_object.h b/src/cl_base_object.h
new file mode 100644
index 0000000..186b149
--- /dev/null
+++ b/src/cl_base_object.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_BASE_OBJECT_H__
+#define __CL_BASE_OBJECT_H__
+
+#include "cl_utils.h"
+#include "cl_khr_icd.h"
+#include "CL/cl.h"
+#include <pthread.h>
+#include <assert.h>
+
+/************************************************************************
+  Every CL object should have:
+
+  ICD dispatcher: holds the ICD function table pointer.
+
+  Reference count: tracks the object's lifetime. The CL retain/release
+  APIs change its value, and we destroy the object when the count
+  reaches 0.
+
+  Magic: a number identifying each kind of CL object, used to check
+  whether a pointer really is the object we expect.
+
+  Mutex & Cond: used to make the CL object MT-safe. Locked critical
+  regions should be short and must not call any blocking function.
+  take_ownership/release_ownership can own the object for a long time;
+  because take_ownership does not hold the lock while the owner works,
+  it does not cause deadlock problems. Waiters block on the condition
+  variable until they can get ownership. (A usage sketch follows the
+  macros at the end of this header.)
+*************************************************************************/
+
+typedef struct _cl_base_object {
+ DEFINE_ICD(dispatch); /* Dispatch function table for the ICD */
+ cl_ulong magic; /* Magic number identifying each CL object type */
+ atomic_t ref; /* Reference count of the CL object */
+ list_node node; /* List node linking the object into some container */
+ pthread_mutex_t mutex; /* The mutex making this object MT-safe */
+ pthread_cond_t cond; /* Condition used to wait for the object */
+ pthread_t owner; /* The thread that currently owns this object */
+} _cl_base_object;
+
+typedef struct _cl_base_object *cl_base_object;
+
+#define CL_OBJECT_INVALID_MAGIC 0xFEFEFEFEFEFEFEFELL
+#define CL_OBJECT_IS_VALID(obj) (((cl_base_object)obj)->magic != CL_OBJECT_INVALID_MAGIC)
+
+#define CL_OBJECT_INC_REF(obj) (atomic_inc(&((cl_base_object)obj)->ref))
+#define CL_OBJECT_DEC_REF(obj) (atomic_dec(&((cl_base_object)obj)->ref))
+#define CL_OBJECT_GET_REF(obj) (atomic_read(&((cl_base_object)obj)->ref))
+
+#define CL_OBJECT_LOCK(obj) (pthread_mutex_lock(&((cl_base_object)obj)->mutex))
+#define CL_OBJECT_UNLOCK(obj) (pthread_mutex_unlock(&((cl_base_object)obj)->mutex))
+
+extern void cl_object_init_base(cl_base_object obj, cl_ulong magic);
+extern void cl_object_destroy_base(cl_base_object obj);
+extern cl_int cl_object_take_ownership(cl_base_object obj, cl_int wait, cl_bool withlock);
+extern void cl_object_release_ownership(cl_base_object obj, cl_bool withlock);
+extern void cl_object_wait_on_cond(cl_base_object obj);
+extern void cl_object_notify_cond(cl_base_object obj);
+
+#define CL_OBJECT_INIT_BASE(obj, magic) (cl_object_init_base((cl_base_object)obj, magic))
+#define CL_OBJECT_DESTROY_BASE(obj) (cl_object_destroy_base((cl_base_object)obj))
+#define CL_OBJECT_TAKE_OWNERSHIP(obj, wait) (cl_object_take_ownership((cl_base_object)obj, wait, CL_FALSE))
+#define CL_OBJECT_RELEASE_OWNERSHIP(obj) (cl_object_release_ownership((cl_base_object)obj, CL_FALSE))
+#define CL_OBJECT_TAKE_OWNERSHIP_WITHLOCK(obj, wait) (cl_object_take_ownership((cl_base_object)obj, wait, CL_TRUE))
+#define CL_OBJECT_RELEASE_OWNERSHIP_WITHLOCK(obj) (cl_object_release_ownership((cl_base_object)obj, CL_TRUE))
+#define CL_OBJECT_WAIT_ON_COND(obj) (cl_object_wait_on_cond((cl_base_object)obj))
+#define CL_OBJECT_NOTIFY_COND(obj) (cl_object_notify_cond((cl_base_object)obj))
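+
+/* Usage sketch (illustrative only; _cl_foo and CL_OBJECT_FOO_MAGIC are
+   hypothetical): a CL object embeds _cl_base_object as its FIRST member,
+   so the casts in the macros above are valid.
+
+     typedef struct _cl_foo {
+       _cl_base_object base;   // must come first
+       int payload;
+     } _cl_foo;
+
+     _cl_foo *foo = cl_calloc(1, sizeof(_cl_foo));
+     CL_OBJECT_INIT_BASE(foo, CL_OBJECT_FOO_MAGIC);
+     ...
+     CL_OBJECT_DESTROY_BASE(foo);
+     cl_free(foo);
+*/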
+
+#endif /* __CL_BASE_OBJECT_H__ */
diff --git a/src/cl_cmrt.cpp b/src/cl_cmrt.cpp
index 25e4d82..f653844 100644
--- a/src/cl_cmrt.cpp
+++ b/src/cl_cmrt.cpp
@@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k, cl_uint index, size_t sz, const void *va
result = cmrt_kernel->SetKernelArg(index, sz, value);
else {
cl_mem mem = *(cl_mem*)value;
- if (mem->magic == CL_MAGIC_MEM_HEADER) {
+ if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) {
if (!CreateCmrtMemory(mem))
return CL_INVALID_ARG_VALUE;
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index b66928f..55b1a23 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -25,7 +25,6 @@
#include "cl_device_id.h"
#include "cl_mem.h"
#include "cl_utils.h"
-#include "cl_thread.h"
#include "cl_alloc.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
@@ -37,80 +36,71 @@
#include <stdio.h>
#include <string.h>
-LOCAL cl_command_queue
+static cl_command_queue
cl_command_queue_new(cl_context ctx)
{
cl_command_queue queue = NULL;
assert(ctx);
- TRY_ALLOC_NO_ERR (queue, CALLOC(struct _cl_command_queue));
- SET_ICD(queue->dispatch)
- queue->magic = CL_MAGIC_QUEUE_HEADER;
- queue->ref_n = 1;
- queue->ctx = ctx;
- queue->cmrt_event = NULL;
- if ((queue->thread_data = cl_thread_data_create()) == NULL) {
- goto error;
+ queue = cl_calloc(1, sizeof(_cl_command_queue));
+ if (queue == NULL)
+ return NULL;
+
+ CL_OBJECT_INIT_BASE(queue, CL_OBJECT_COMMAND_QUEUE_MAGIC);
+ if (cl_command_queue_init_enqueue(queue) != CL_SUCCESS) {
+ cl_free(queue);
+ return NULL;
}
/* Append the command queue in the list */
- pthread_mutex_lock(&ctx->queue_lock);
- queue->next = ctx->queues;
- if (ctx->queues != NULL)
- ctx->queues->prev = queue;
- ctx->queues = queue;
- pthread_mutex_unlock(&ctx->queue_lock);
+ cl_context_add_queue(ctx, queue);
+ return queue;
+}
- /* The queue also belongs to its context */
- cl_context_add_ref(ctx);
+LOCAL cl_command_queue
+cl_create_command_queue(cl_context ctx, cl_device_id device, cl_command_queue_properties properties,
+ cl_uint queue_size, cl_int *errcode_ret)
+{
+ cl_command_queue queue = cl_command_queue_new(ctx);
+ if (queue == NULL) {
+ *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+ return NULL;
+ }
-exit:
+ queue->props = properties;
+ queue->device = device;
+ queue->size = queue_size;
+
+ *errcode_ret = CL_SUCCESS;
return queue;
-error:
- cl_command_queue_delete(queue);
- queue = NULL;
- goto exit;
}
LOCAL void
cl_command_queue_delete(cl_command_queue queue)
{
assert(queue);
- if (atomic_dec(&queue->ref_n) != 1) return;
-
-#ifdef HAS_CMRT
- if (queue->cmrt_event != NULL)
- cmrt_destroy_event(queue);
-#endif
-
- // If there is a list of valid events, we need to give them
- // a chance to call the call-back function.
- cl_event_update_last_events(queue,1);
- /* Remove it from the list */
- assert(queue->ctx);
- pthread_mutex_lock(&queue->ctx->queue_lock);
- if (queue->prev)
- queue->prev->next = queue->next;
- if (queue->next)
- queue->next->prev = queue->prev;
- if (queue->ctx->queues == queue)
- queue->ctx->queues = queue->next;
- pthread_mutex_unlock(&queue->ctx->queue_lock);
-
- cl_thread_data_destroy(queue);
- queue->thread_data = NULL;
+ if (CL_OBJECT_DEC_REF(queue) > 1)
+ return;
+
+ /* Before we destroy the queue, we should make sure all
+ the commands in the queue are finished. */
+ cl_command_queue_wait_finish(queue);
+ cl_context_remove_queue(queue->ctx, queue);
+
+ cl_command_queue_destroy_enqueue(queue);
+
cl_mem_delete(queue->perf);
- cl_context_delete(queue->ctx);
- cl_free(queue->wait_events);
- cl_free(queue->barrier_events);
- queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ if (queue->barrier_events) {
+ cl_free(queue->barrier_events);
+ }
+ CL_OBJECT_DESTROY_BASE(queue);
cl_free(queue);
}
LOCAL void
cl_command_queue_add_ref(cl_command_queue queue)
{
- atomic_inc(&queue->ref_n);
+ CL_OBJECT_INC_REF(queue);
}
static void
@@ -131,10 +121,9 @@ set_image_info(char *curbe,
}
LOCAL cl_int
-cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
{
uint32_t i;
- GET_QUEUE_THREAD_GPGPU(queue);
for (i = 0; i < k->image_sz; i++) {
int id = k->images[i].arg_idx;
@@ -143,6 +132,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
image = cl_mem_image(k->args[id].mem);
set_image_info(k->curbe, &k->images[i], image);
+ if(*max_bti < k->images[i].idx)
+ *max_bti = k->images[i].idx;
if(k->vme){
if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) )
return CL_IMAGE_FORMAT_NOT_SUPPORTED;
@@ -168,33 +159,71 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
}
LOCAL cl_int
-cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
{
- GET_QUEUE_THREAD_GPGPU(queue);
-
/* Bind all user buffers (given by clSetKernelArg) */
- uint32_t i;
+ uint32_t i, bti;
+ uint32_t ocl_version = interp_kernel_get_ocl_version(k->opaque);
enum gbe_arg_type arg_type; /* kind of argument */
for (i = 0; i < k->arg_n; ++i) {
int32_t offset; // location of the address in the curbe
arg_type = interp_kernel_get_arg_type(k->opaque, i);
- if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
+ if (!(arg_type == GBE_ARG_GLOBAL_PTR ||
+ (arg_type == GBE_ARG_CONSTANT_PTR && ocl_version >= 200) ||
+ arg_type == GBE_ARG_PIPE) ||
+ !k->args[i].mem)
continue;
offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
if (offset < 0)
continue;
+ bti = interp_kernel_get_arg_bti(k->opaque, i);
+ if(*max_bti < bti)
+ *max_bti = bti;
if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, bti);
} else {
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+ size_t mem_offset = 0; /* Offset of the SVM pointer within its backing mem object. */
+ if(k->args[i].is_svm) {
+ mem_offset = (size_t)k->args[i].ptr - (size_t)k->args[i].mem->host_ptr;
+ }
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + mem_offset, k->args[i].mem->size, bti);
}
}
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
+{
+ uint32_t i;
+ size_t mem_offset, bti = *max_bti;
+ cl_mem mem;
+ int32_t offset = interp_kernel_get_curbe_size(k->opaque);
+
+ for (i = 0; i < k->exec_info_n; i++) {
+ void *ptr = k->exec_info[i];
+ mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr);
+ if(mem == NULL)
+ mem = cl_context_get_mem_from_ptr(k->program->ctx, ptr);
+
+ if (mem) {
+ mem_offset = (size_t)ptr - (size_t)mem->host_ptr;
+ /* Only the surface state needs a realloc here; the curbe does not. */
+ cl_gpgpu_bind_buf(gpgpu, mem->bo, offset + i * sizeof(ptr), mem->offset + mem_offset, mem->size, bti++);
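+ /* BTI_WORKAROUND_IMAGE_OFFSET appears to be a reserved slot;
+ skip past it (an assumption based on the name). */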
+ if(bti == BTI_WORKAROUND_IMAGE_OFFSET)
+ bti = *max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
+ assert(bti < BTI_MAX_ID);
+ }
+ }
+ *max_bti = bti;
return CL_SUCCESS;
}
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_event,
+ uint32_t, const size_t *, const size_t *,const size_t *,
+ const size_t *, const size_t *, const size_t *);
static cl_int
cl_kernel_check_args(cl_kernel k)
@@ -209,10 +238,14 @@ cl_kernel_check_args(cl_kernel k)
LOCAL cl_int
cl_command_queue_ND_range(cl_command_queue queue,
cl_kernel k,
+ cl_event event,
const uint32_t work_dim,
const size_t *global_wk_off,
+ const size_t *global_dim_off,
const size_t *global_wk_sz,
- const size_t *local_wk_sz)
+ const size_t *global_wk_sz_use,
+ const size_t *local_wk_sz,
+ const size_t *local_wk_sz_use)
{
if(b_output_kernel_perf)
time_start(queue->ctx, cl_kernel_get_name(k), queue);
@@ -222,8 +255,13 @@ cl_command_queue_ND_range(cl_command_queue queue,
/* Check that the user did not forget any argument */
TRY (cl_kernel_check_args, k);
+
if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
- TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+ TRY (cl_command_queue_ND_range_gen7, queue, k, event, work_dim,
+ global_wk_off, global_dim_off, global_wk_sz,
+ global_wk_sz_use, local_wk_sz, local_wk_sz_use);
+
else
FATAL ("Unknown Gen Device");
@@ -232,7 +270,7 @@ error:
}
LOCAL int
-cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
+cl_command_queue_flush_gpgpu(cl_gpgpu gpgpu)
{
void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
void* profiling_info;
@@ -257,171 +295,73 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
interp_output_profiling(profiling_info, cl_gpgpu_map_profiling_buffer(gpgpu));
cl_gpgpu_unmap_profiling_buffer(gpgpu);
}
- return CL_SUCCESS;
-}
-
-LOCAL cl_int
-cl_command_queue_flush(cl_command_queue queue)
-{
- int err;
- GET_QUEUE_THREAD_GPGPU(queue);
- err = cl_command_queue_flush_gpgpu(queue, gpgpu);
- // We now keep a list of uncompleted events and check if they compelte
- // every flush. This can make sure all events created have chance to be
- // update status, so the callback functions or reference can be handled.
- cl_event_update_last_events(queue,0);
-
- cl_event current_event = get_current_event(queue);
- if (current_event && err == CL_SUCCESS) {
- err = cl_event_flush(current_event);
- set_current_event(queue, NULL);
- }
- cl_invalid_thread_gpgpu(queue);
- return err;
-}
-LOCAL cl_int
-cl_command_queue_finish(cl_command_queue queue)
-{
- cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
- cl_event_update_last_events(queue,1);
return CL_SUCCESS;
}
-#define DEFAULT_WAIT_EVENTS_SIZE 16
-LOCAL void
-cl_command_queue_insert_event(cl_command_queue queue, cl_event event)
-{
- cl_int i=0;
- cl_event *new_list;
-
- assert(queue != NULL);
- if(queue->wait_events == NULL) {
- queue->wait_events_size = DEFAULT_WAIT_EVENTS_SIZE;
- TRY_ALLOC_NO_ERR (queue->wait_events, CALLOC_ARRAY(cl_event, queue->wait_events_size));
- }
-
- for(i=0; i<queue->wait_events_num; i++) {
- if(queue->wait_events[i] == event)
- return; //is in the wait_events, need to insert
- }
-
- if(queue->wait_events_num < queue->wait_events_size) {
- queue->wait_events[queue->wait_events_num++] = event;
- return;
- }
-
- //wait_events_num == wait_events_size, array is full
- queue->wait_events_size *= 2;
- TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->wait_events_size));
- memcpy(new_list, queue->wait_events, sizeof(cl_event)*queue->wait_events_num);
- cl_free(queue->wait_events);
- queue->wait_events = new_list;
- queue->wait_events[queue->wait_events_num++] = event;
- return;
-
-exit:
- return;
-error:
- if(queue->wait_events)
- cl_free(queue->wait_events);
- queue->wait_events = NULL;
- queue->wait_events_size = 0;
- queue->wait_events_num = 0;
- goto exit;
-
-}
-
-LOCAL void
-cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
-{
- cl_int i=0;
-
- assert(queue->wait_events);
- for(i=0; i<queue->wait_events_num; i++) {
- if(queue->wait_events[i] == event)
- break;
- }
-
- if(i == queue->wait_events_num)
- return;
-
- if(i == queue->wait_events_num - 1) {
- queue->wait_events[i] = NULL;
- } else {
- for(; i<queue->wait_events_num-1; i++) {
- queue->wait_events[i] = queue->wait_events[i+1];
- }
- }
- queue->wait_events_num -= 1;
-}
-
-#define DEFAULT_WAIT_EVENTS_SIZE 16
LOCAL void
cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
{
- cl_int i=0;
- cl_event *new_list;
+ cl_int i = 0;
+
+ cl_event_add_ref(event);
assert(queue != NULL);
- if(queue->barrier_events == NULL) {
- queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE;
- TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+ CL_OBJECT_LOCK(queue);
+
+ if (queue->barrier_events == NULL) {
+ queue->barrier_events_size = 4;
+ queue->barrier_events = cl_calloc(queue->barrier_events_size, sizeof(cl_event));
+ assert(queue->barrier_events);
}
- for(i=0; i<queue->barrier_events_num; i++) {
- if(queue->barrier_events[i] == event)
- return; //is in the barrier_events, need to insert
+ for (i = 0; i<queue->barrier_events_num; i++) {
+ assert(queue->barrier_events[i] != event);
}
if(queue->barrier_events_num < queue->barrier_events_size) {
queue->barrier_events[queue->barrier_events_num++] = event;
+ CL_OBJECT_UNLOCK(queue);
return;
}
- //barrier_events_num == barrier_events_size, array is full
+ /* Array is full; double its size. */
queue->barrier_events_size *= 2;
- TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
- memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num);
- cl_free(queue->barrier_events);
- queue->barrier_events = new_list;
- queue->barrier_events[queue->barrier_events_num++] = event;
- return;
+ queue->barrier_events = cl_realloc(queue->barrier_events,
+ queue->barrier_events_size * sizeof(cl_event));
+ assert(queue->barrier_events);
-exit:
+ queue->barrier_events[queue->barrier_events_num++] = event;
+ CL_OBJECT_UNLOCK(queue);
return;
-error:
- if(queue->barrier_events)
- cl_free(queue->barrier_events);
- queue->barrier_events = NULL;
- queue->barrier_events_size = 0;
- queue->barrier_events_num = 0;
- goto exit;
-
}
LOCAL void
cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
{
- cl_int i=0;
+ cl_int i = 0;
+ assert(queue != NULL);
- if(queue->barrier_events_num == 0)
- return;
+ CL_OBJECT_LOCK(queue);
+
+ assert(queue->barrier_events_num > 0);
+ assert(queue->barrier_events);
- for(i=0; i<queue->barrier_events_num; i++) {
+ for(i = 0; i < queue->barrier_events_num; i++) {
if(queue->barrier_events[i] == event)
break;
}
+ assert(i < queue->barrier_events_num); // Must find it.
- if(i == queue->barrier_events_num)
- return;
-
- if(i == queue->barrier_events_num - 1) {
+ if(i == queue->barrier_events_num - 1) { // The last one.
queue->barrier_events[i] = NULL;
} else {
- for(; i<queue->barrier_events_num-1; i++) {
+ for(; i < queue->barrier_events_num - 1; i++) { // Move forward.
queue->barrier_events[i] = queue->barrier_events[i+1];
}
}
queue->barrier_events_num -= 1;
+ CL_OBJECT_UNLOCK(queue);
+
+ cl_event_delete(event);
}
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index d1b8c44..9f6ff39 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -22,84 +22,87 @@
#include "cl_internals.h"
#include "cl_driver.h"
-#include "cl_thread.h"
+#include "cl_base_object.h"
#include "CL/cl.h"
#include <stdint.h>
struct intel_gpgpu;
+typedef struct _cl_command_queue_enqueue_worker {
+ cl_command_queue queue;
+ pthread_t tid;
+ cl_uint cookie;
+ cl_bool quit;
+ list_head enqueued_events;
+ cl_uint in_exec_status; // Same value as CL_COMPLETE, CL_SUBMITTED ...
+} _cl_command_queue_enqueue_worker;
+
+typedef _cl_command_queue_enqueue_worker *cl_command_queue_enqueue_worker;
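+
+/* Each command queue owns one worker thread that drains enqueued_events:
+   it submits the ready events, waits for them to complete, and then
+   deletes them (see cl_command_queue_enqueue.c). */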
+
/* Basically, this is a (kind-of) batch buffer */
-struct _cl_command_queue {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a command queue */
- volatile int ref_n; /* We reference count this object */
+typedef struct _cl_command_queue {
+ _cl_base_object base;
+ _cl_command_queue_enqueue_worker worker;
cl_context ctx; /* Its parent context */
- cl_event* barrier_events; /* Point to array of non-complete user events that block this command queue */
- cl_int barrier_events_num; /* Number of Non-complete user events */
- cl_int barrier_events_size; /* The size of array that wait_events point to */
- cl_event* wait_events; /* Point to array of non-complete user events that block this command queue */
- cl_int wait_events_num; /* Number of Non-complete user events */
- cl_int wait_events_size; /* The size of array that wait_events point to */
- cl_command_queue_properties props; /* Queue properties */
- cl_command_queue prev, next; /* We chain the command queues together */
- void *thread_data; /* Used to store thread context data */
+ cl_device_id device; /* Its device */
+ cl_event* barrier_events; /* Array of non-complete user events that block this command queue */
+ cl_int barrier_events_num; /* Number of non-complete user events */
+ cl_int barrier_events_size; /* The size of the array that barrier_events points to */
+ cl_command_queue_properties props; /* Queue properties */
cl_mem perf; /* Where to put the perf counters */
+ cl_uint size; /* The specified size for the queue */
+} _cl_command_queue;
- void* cmrt_event; /* the latest CmEvent* of the command queue */
-};
-
-/* The macro to get the thread specified gpgpu struct. */
-#define GET_QUEUE_THREAD_GPGPU(queue) \
- cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL; \
- if (queue) \
- assert(gpgpu);
+#define CL_OBJECT_COMMAND_QUEUE_MAGIC 0x83650a12b79ce4efLL
+#define CL_OBJECT_IS_COMMAND_QUEUE(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_COMMAND_QUEUE_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
/* Allocate and initialize a new command queue. Also insert it in the list of
- * command queue in the associated context
- */
-extern cl_command_queue cl_command_queue_new(cl_context);
-
+ * command queue in the associated context */
+extern cl_command_queue cl_create_command_queue(cl_context, cl_device_id,
+ cl_command_queue_properties, cl_uint, cl_int*);
/* Destroy and deallocate the command queue */
extern void cl_command_queue_delete(cl_command_queue);
-
/* Keep one more reference on the queue */
extern void cl_command_queue_add_ref(cl_command_queue);
-
/* Map ND range kernel from OCL API */
extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
cl_kernel ker,
+ cl_event event,
const uint32_t work_dim,
- const size_t *global_work_offset,
- const size_t *global_work_size,
- const size_t *local_work_size);
+ const size_t *global_wk_off,
+ const size_t *global_dim_off,
+ const size_t *global_wk_sz,
+ const size_t *global_wk_sz_use,
+ const size_t *local_wk_sz,
+ const size_t *local_wk_sz_use);
/* The memory object where to report the performance */
extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
-
-/* Flush for the command queue */
-extern cl_int cl_command_queue_flush(cl_command_queue);
-
/* Flush for the specified gpgpu */
-extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
-
-/* Wait for the completion of the command queue */
-extern cl_int cl_command_queue_finish(cl_command_queue);
-
+extern int cl_command_queue_flush_gpgpu(cl_gpgpu);
/* Bind all the surfaces in the GPGPU state */
-extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
-
+extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
/* Bind all the image surfaces in the GPGPU state */
-extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
+/* Bind all exec info to bind table */
+extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
/* Insert a user event to command's wait_events */
extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
-
/* Remove a user event from command's wait_events */
extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
-
extern void cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event);
-
extern void cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event);
+extern void cl_command_queue_notify(cl_command_queue queue);
+extern void cl_command_queue_enqueue_event(cl_command_queue queue, cl_event event);
+extern cl_int cl_command_queue_init_enqueue(cl_command_queue queue);
+extern void cl_command_queue_destroy_enqueue(cl_command_queue queue);
+extern cl_int cl_command_queue_wait_finish(cl_command_queue queue);
+extern cl_int cl_command_queue_wait_flush(cl_command_queue queue);
+/* Note: must be called with the queue's lock held. */
+extern cl_event *cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num);
#endif /* __CL_COMMAND_QUEUE_H__ */
diff --git a/src/cl_command_queue_enqueue.c b/src/cl_command_queue_enqueue.c
new file mode 100644
index 0000000..44a0761
--- /dev/null
+++ b/src/cl_command_queue_enqueue.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: He Junyan <junyan.he at intel.com>
+ */
+
+#include "cl_command_queue.h"
+#include "cl_event.h"
+#include "cl_alloc.h"
+#include <stdio.h>
+
+static void *
+worker_thread_function(void *Arg)
+{
+ cl_command_queue_enqueue_worker worker = (cl_command_queue_enqueue_worker)Arg;
+ cl_command_queue queue = worker->queue;
+ cl_event e;
+ cl_uint cookie = -1;
+ list_node *pos;
+ list_node *n;
+ list_head ready_list;
+ cl_int exec_status;
+
+ CL_OBJECT_LOCK(queue);
+
+ while (1) {
+ /* Must have locked here. */
+
+ if (worker->quit == CL_TRUE) {
+ CL_OBJECT_UNLOCK(queue);
+ return NULL;
+ }
+
+ if (list_empty(&worker->enqueued_events)) {
+ CL_OBJECT_WAIT_ON_COND(queue);
+ continue;
+ }
+
+ /* The cookie changes whenever an event status changes or something
+ happens to this command queue. If we already scanned the event list
+ and found nothing to execute, we wait for a cookie update to avoid
+ spinning forever. */
+ if (cookie == worker->cookie) {
+ CL_OBJECT_WAIT_ON_COND(queue);
+ continue;
+ }
+
+ /* Hold the lock while checking event status, to avoid missing a status notification. */
+ list_init(&ready_list);
+ list_for_each_safe(pos, n, &worker->enqueued_events)
+ {
+ e = list_entry(pos, _cl_event, enqueue_node);
+ if (cl_event_is_ready(e) <= CL_COMPLETE) {
+ list_node_del(&e->enqueue_node);
+ list_add_tail(&ready_list, &e->enqueue_node);
+ }
+ }
+
+ if (list_empty(&ready_list)) { /* Nothing to do, just wait. */
+ cookie = worker->cookie;
+ continue;
+ }
+
+ /* Notify waiters; we changed the event list. */
+ CL_OBJECT_NOTIFY_COND(queue);
+
+ worker->in_exec_status = CL_QUEUED;
+ CL_OBJECT_UNLOCK(queue);
+
+ /* Do the real work without holding the lock. */
+ exec_status = CL_SUBMITTED;
+ list_for_each_safe(pos, n, &ready_list)
+ {
+ e = list_entry(pos, _cl_event, enqueue_node);
+ cl_event_exec(e, exec_status, CL_FALSE);
+ }
+
+ /* Notify everyone waiting for a flush. */
+ CL_OBJECT_LOCK(queue);
+ worker->in_exec_status = CL_SUBMITTED;
+ CL_OBJECT_NOTIFY_COND(queue);
+ CL_OBJECT_UNLOCK(queue);
+
+ list_for_each_safe(pos, n, &ready_list)
+ {
+ e = list_entry(pos, _cl_event, enqueue_node);
+ cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+ }
+
+ /* Clear and delete all the events. */
+ list_for_each_safe(pos, n, &ready_list)
+ {
+ e = list_entry(pos, _cl_event, enqueue_node);
+ list_node_del(&e->enqueue_node);
+ cl_event_delete(e);
+ }
+
+ CL_OBJECT_LOCK(queue);
+ worker->in_exec_status = CL_COMPLETE;
+
+ /* Notify finish waiters; we have handled all the ready events. */
+ CL_OBJECT_NOTIFY_COND(queue);
+ }
+}
+
+LOCAL void
+cl_command_queue_notify(cl_command_queue queue)
+{
+ if (CL_OBJECT_GET_REF(queue) < 1) {
+ return;
+ }
+
+ assert(queue && (((cl_base_object)queue)->magic == CL_OBJECT_COMMAND_QUEUE_MAGIC));
+ CL_OBJECT_LOCK(queue);
+ queue->worker.cookie++;
+ CL_OBJECT_NOTIFY_COND(queue);
+ CL_OBJECT_UNLOCK(queue);
+}
+
+LOCAL void
+cl_command_queue_enqueue_event(cl_command_queue queue, cl_event event)
+{
+ CL_OBJECT_INC_REF(event);
+ assert(CL_OBJECT_IS_COMMAND_QUEUE(queue));
+ CL_OBJECT_LOCK(queue);
+ assert(queue->worker.quit == CL_FALSE);
+ assert(list_node_out_of_list(&event->enqueue_node));
+ list_add_tail(&queue->worker.enqueued_events, &event->enqueue_node);
+ queue->worker.cookie++;
+ CL_OBJECT_NOTIFY_COND(queue);
+ CL_OBJECT_UNLOCK(queue);
+}
+
+LOCAL cl_int
+cl_command_queue_init_enqueue(cl_command_queue queue)
+{
+ cl_command_queue_enqueue_worker worker = &queue->worker;
+ worker->queue = queue;
+ worker->quit = CL_FALSE;
+ worker->in_exec_status = CL_COMPLETE;
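+ /* 8 appears arbitrary; it only needs to differ from the worker's
+ initial local cookie (-1) so the first pass scans the list. */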
+ worker->cookie = 8;
+ list_init(&worker->enqueued_events);
+
+ if (pthread_create(&worker->tid, NULL, worker_thread_function, worker)) {
+ DEBUGP(DL_ERROR, "Can not create worker thread for queue %p...\n", queue);
+ return CL_OUT_OF_RESOURCES;
+ }
+
+ return CL_SUCCESS;
+}
+
+LOCAL void
+cl_command_queue_destroy_enqueue(cl_command_queue queue)
+{
+ cl_command_queue_enqueue_worker worker = &queue->worker;
+ list_node *pos;
+ list_node *n;
+ cl_event e;
+
+ assert(worker->queue == queue);
+ assert(worker->quit == CL_FALSE);
+
+ CL_OBJECT_LOCK(queue);
+ worker->quit = CL_TRUE;
+ CL_OBJECT_NOTIFY_COND(queue);
+ CL_OBJECT_UNLOCK(queue);
+
+ pthread_join(worker->tid, NULL);
+
+ /* We should have waited for all work to finish before destroying the command queue. */
+ if (!list_empty(&worker->enqueued_events)) {
+ DEBUGP(DL_WARNING, "There are still some enqueued works in the queue %p when this"
+ " queue is destroyed, this may cause very serious problems.\n",
+ queue);
+
+ list_for_each_safe(pos, n, &worker->enqueued_events)
+ {
+ e = list_entry(pos, _cl_event, enqueue_node);
+ list_node_del(&e->enqueue_node);
+ cl_event_set_status(e, -1); // Give waiters a chance to wake up.
+ cl_event_delete(e);
+ }
+ }
+}
+
+/* Note: must be called with the queue's lock held. */
+LOCAL cl_event *
+cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num)
+{
+ int event_num = 0;
+ list_node *pos;
+ cl_command_queue_enqueue_worker worker = &queue->worker;
+ cl_event *enqueued_list = NULL;
+ int i;
+ cl_event tmp_e = NULL;
+
+ list_for_each(pos, &worker->enqueued_events)
+ {
+ event_num++;
+ }
+ assert(event_num > 0);
+
+ enqueued_list = cl_calloc(event_num, sizeof(cl_event));
+ assert(enqueued_list);
+
+ i = 0;
+ list_for_each(pos, &worker->enqueued_events)
+ {
+ tmp_e = list_entry(pos, _cl_event, enqueue_node);
+ cl_event_add_ref(tmp_e); // Temporarily add a reference to prevent deletion.
+ enqueued_list[i] = tmp_e;
+ i++;
+ }
+ assert(i == event_num);
+
+ *list_num = event_num;
+ return enqueued_list;
+}
+
+LOCAL cl_int
+cl_command_queue_wait_flush(cl_command_queue queue)
+{
+ cl_command_queue_enqueue_worker worker = &queue->worker;
+ cl_event *enqueued_list = NULL;
+ cl_uint enqueued_num = 0;
+ int i;
+
+ CL_OBJECT_LOCK(queue);
+
+ if (worker->quit) { // Queue already destroyed?
+ CL_OBJECT_UNLOCK(queue);
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
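+ /* Snapshot the currently enqueued events under the lock; each event
+ gets a temporary reference so it stays alive after we drop the lock. */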
+ if (!list_empty(&worker->enqueued_events)) {
+ enqueued_list = cl_command_queue_record_in_queue_events(queue, &enqueued_num);
+ assert(enqueued_num > 0);
+ assert(enqueued_list);
+ }
+
+ while (worker->in_exec_status == CL_QUEUED) {
+ CL_OBJECT_WAIT_ON_COND(queue);
+
+ if (worker->quit) { // Queue already destroyed?
+ CL_OBJECT_UNLOCK(queue);
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+ }
+
+ CL_OBJECT_UNLOCK(queue);
+
+ /* Wait until all events reach the submitted status. */
+ for (i = 0; i < enqueued_num; i++) {
+ CL_OBJECT_LOCK(enqueued_list[i]);
+ while (enqueued_list[i]->status > CL_SUBMITTED) {
+ CL_OBJECT_WAIT_ON_COND(enqueued_list[i]);
+ }
+ CL_OBJECT_UNLOCK(enqueued_list[i]);
+ }
+
+ for (i = 0; i < enqueued_num; i++) {
+ cl_event_delete(enqueued_list[i]);
+ }
+ if (enqueued_list)
+ cl_free(enqueued_list);
+
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_wait_finish(cl_command_queue queue)
+{
+ cl_command_queue_enqueue_worker worker = &queue->worker;
+ cl_event *enqueued_list = NULL;
+ cl_uint enqueued_num = 0;
+ int i;
+
+ CL_OBJECT_LOCK(queue);
+
+ if (worker->quit) { // Queue already destroyed?
+ CL_OBJECT_UNLOCK(queue);
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+
+ if (!list_empty(&worker->enqueued_events)) {
+ enqueued_list = cl_command_queue_record_in_queue_events(queue, &enqueued_num);
+ assert(enqueued_num > 0);
+ assert(enqueued_list);
+ }
+
+ while (worker->in_exec_status > CL_COMPLETE) {
+ CL_OBJECT_WAIT_ON_COND(queue);
+
+ if (worker->quit) { // Queue already destroyed?
+ CL_OBJECT_UNLOCK(queue);
+ return CL_INVALID_COMMAND_QUEUE;
+ }
+ }
+
+ CL_OBJECT_UNLOCK(queue);
+
+ /* Wait until all events reach the complete status. */
+ for (i = 0; i < enqueued_num; i++) {
+ CL_OBJECT_LOCK(enqueued_list[i]);
+ while (enqueued_list[i]->status > CL_COMPLETE) {
+ CL_OBJECT_WAIT_ON_COND(enqueued_list[i]);
+ }
+ CL_OBJECT_UNLOCK(enqueued_list[i]);
+ }
+
+ for (i = 0; i < enqueued_num; i++) {
+ cl_event_delete(enqueued_list[i]);
+ }
+ if (enqueued_list)
+ cl_free(enqueued_list);
+
+ return CL_SUCCESS;
+}
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index a7b967d..dd82a44 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -23,12 +23,15 @@
#include "cl_kernel.h"
#include "cl_device_id.h"
#include "cl_mem.h"
+#include "cl_event.h"
#include "cl_utils.h"
#include "cl_alloc.h"
+#include "cl_device_enqueue.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
+#include <unistd.h>
#define MAX_GROUP_SIZE_IN_HALFSLICE 512
static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; }
@@ -123,12 +126,24 @@ error:
}
static int
-cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker, cl_gpgpu gpgpu)
{
- /* calculate constant buffer size
- * we need raw_size & aligned_size
- */
- GET_QUEUE_THREAD_GPGPU(queue);
+ if (interp_kernel_get_ocl_version(ker->opaque) >= 200) {
+ // pass the starting of constant address space
+ int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
+ if (constant_addrspace >= 0) {
+ size_t global_const_size = interp_program_get_global_constant_size(ker->program->opaque);
+ if (global_const_size > 0) {
+ *(char **)(ker->curbe + constant_addrspace) = ker->program->global_data_ptr;
+ cl_gpgpu_bind_buf(gpgpu, ker->program->global_data, constant_addrspace, 0, ALIGN(global_const_size, getpagesize()), BTI_CONSTANT);
+ }
+ }
+ return 0;
+ }
+ // TODO: the code below is only valid for OpenCL 1.2.
+ // Under OpenCL 1.2 we gather all constants into one dedicated surface,
+ // but in 2.0 we put program globals into one surface while constants
+ // pass through kernel arguments, each in a separate buffer.
int32_t arg;
size_t offset = 0;
uint32_t raw_size = 0, aligned_size =0;
@@ -207,6 +222,7 @@ cl_curbe_fill(cl_kernel ker,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz,
+ const size_t *enqueued_local_wk_sz,
size_t thread_n)
{
int32_t offset;
@@ -216,15 +232,18 @@ cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
+ UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0]);
+ UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1]);
+ UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
- UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
- UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
- UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
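+ /* The group count is the ceiling of the global size divided by the
+ enqueued local size, so a partial trailing group (OpenCL 2.0
+ non-uniform work-groups) is counted as well. */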
+ UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0]%enqueued_local_wk_sz[0]?1:0));
+ UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1]%enqueued_local_wk_sz[1]?1:0));
+ UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2]%enqueued_local_wk_sz[2]?1:0));
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD
@@ -255,11 +274,11 @@ static void
cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
{
cl_context ctx = ker->program->ctx;
- cl_device_id device = ctx->device;
+ cl_device_id device = ctx->devices[0];
const int32_t per_lane_stack_sz = ker->stack_size;
const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
const int32_t sub_value = GBE_STACK_BUFFER;
- const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+ const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
int32_t stack_sz = per_lane_stack_sz;
/* No stack required for this kernel */
@@ -269,9 +288,9 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
/* The stack size is given for *each* SIMD lane. So, we accordingly compute
* the size we need for the complete machine
*/
- assert(offset >= 0);
+ assert(offset_stack_buffer >= 0);
stack_sz *= interp_kernel_get_simd_width(ker->opaque);
- stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
+ stack_sz *= device->max_compute_unit * ctx->devices[0]->max_thread_per_unit;
/* for some hardware, part of EUs are disabled with EU id reserved,
* it makes the active EU id larger than count of EUs within a subslice,
@@ -279,7 +298,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
*/
cl_driver_enlarge_stack_size(ctx->drv, &stack_sz);
- cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+ const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0);
+ if (offset_stack_size >= 0) {
+ *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz;
+ }
+
+ cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE);
}
static int
@@ -331,24 +355,36 @@ cl_alloc_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_kernel ker,
+ cl_event event,
const uint32_t work_dim,
const size_t *global_wk_off,
+ const size_t *global_dim_off,
const size_t *global_wk_sz,
- const size_t *local_wk_sz)
+ const size_t *global_wk_sz_use,
+ const size_t *local_wk_sz,
+ const size_t *local_wk_sz_use)
{
- GET_QUEUE_THREAD_GPGPU(queue);
+ cl_gpgpu gpgpu = cl_gpgpu_new(queue->ctx->drv);
cl_context ctx = queue->ctx;
char *final_curbe = NULL; /* Includes them and one sub-buffer per group */
cl_gpgpu_kernel kernel;
const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
size_t i, batch_sz = 0u, local_sz = 0u;
- size_t cst_sz = ker->curbe_sz= interp_kernel_get_curbe_size(ker->opaque);
+ size_t cst_sz = interp_kernel_get_curbe_size(ker->opaque);
int32_t scratch_sz = interp_kernel_get_scratch_size(ker->opaque);
size_t thread_n = 0u;
int printf_num = 0;
cl_int err = CL_SUCCESS;
size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
void* printf_info = NULL;
+ uint32_t max_bti = 0;
+
+ if (ker->exec_info_n > 0) {
+ cst_sz += ker->exec_info_n * sizeof(void *);
+ cst_sz = (cst_sz + 31) / 32 * 32; // Align to the 32-byte register size (hard-coded here).
+ ker->curbe = cl_realloc(ker->curbe, cst_sz);
+ }
+ ker->curbe_sz = cst_sz;
/* Setup kernel */
kernel.name = interp_kernel_get_name(ker->opaque);
@@ -359,21 +395,21 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
kernel.use_slm = interp_kernel_use_slm(ker->opaque);
/* Compute the number of HW threads we need */
- if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
+ if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz_use, 3, &local_sz) != CL_SUCCESS)) {
DEBUGP(DL_ERROR, "Work group size exceed Kernel's work group size.");
return err;
}
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
kernel.curbe_sz = cst_sz;
- if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
+ if (scratch_sz > ker->program->ctx->devices[0]->scratch_mem_size) {
DEBUGP(DL_ERROR, "Out of scratch memory %d.", scratch_sz);
return CL_OUT_OF_RESOURCES;
}
/* Curbe step 1: fill the constant urb buffer data shared by all threads */
if (ker->curbe) {
- kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
- if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
+ kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz_use, local_wk_sz, thread_n);
+ if (kernel.slm_sz > ker->program->ctx->devices[0]->local_mem_size) {
DEBUGP(DL_ERROR, "Out of shared local memory %d.", kernel.slm_sz);
return CL_OUT_OF_RESOURCES;
}
@@ -384,9 +420,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Setup the kernel */
if (queue->props & CL_QUEUE_PROFILING_ENABLE)
- err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
+ err = cl_gpgpu_state_init(gpgpu, ctx->devices[0]->max_compute_unit * ctx->devices[0]->max_thread_per_unit, cst_sz / 32, 1);
else
- err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
+ err = cl_gpgpu_state_init(gpgpu, ctx->devices[0]->max_compute_unit * ctx->devices[0]->max_thread_per_unit, cst_sz / 32, 0);
if (err != 0)
goto error;
printf_num = interp_get_printf_num(printf_info);
@@ -403,10 +439,14 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
}
/* Bind user buffers */
- cl_command_queue_bind_surface(queue, ker);
+ cl_command_queue_bind_surface(queue, ker, gpgpu, &max_bti);
/* Bind user images */
- if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker) != CL_SUCCESS))
+ if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker, gpgpu, &max_bti) != CL_SUCCESS))
return err;
+ /* Bind all exec infos */
+ cl_command_queue_bind_exec_info(queue, ker, gpgpu, &max_bti);
+ /* Bind device enqueue buffer */
+ cl_device_enqueue_bind_buffer(gpgpu, ker, &max_bti, &kernel);
/* Bind all samplers */
if (ker->vme)
cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
@@ -419,7 +459,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
- if (cl_upload_constant_buffer(queue, ker) != 0)
+ if (cl_upload_constant_buffer(queue, ker, gpgpu) != 0)
goto error;
cl_gpgpu_states_setup(gpgpu, &kernel);
@@ -431,7 +471,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
}
- TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+ TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz_use, simd_sz, cst_sz, thread_n);
if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
goto error;
}
@@ -440,14 +480,19 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
batch_sz = cl_kernel_compute_batch_sz(ker);
if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
goto error;
- cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
cl_gpgpu_batch_start(gpgpu);
/* Issue the GPGPU_WALKER command */
- cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+ cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use);
/* Close the batch buffer and submit it */
cl_gpgpu_batch_end(gpgpu, 0);
+
+ event->exec_data.queue = queue;
+ event->exec_data.gpgpu = gpgpu;
+ event->exec_data.type = EnqueueNDRangeKernel;
+
return CL_SUCCESS;
error:
diff --git a/src/cl_context.c b/src/cl_context.c
index a6bde7d..3f2e757 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -22,6 +22,8 @@
#include "cl_context.h"
#include "cl_command_queue.h"
#include "cl_mem.h"
+#include "cl_sampler.h"
+#include "cl_event.h"
#include "cl_alloc.h"
#include "cl_utils.h"
#include "cl_driver.h"
@@ -38,6 +40,139 @@
#include <assert.h>
#include <string.h>
+LOCAL void
+cl_context_add_queue(cl_context ctx, cl_command_queue queue) {
+ assert(queue->ctx == NULL);
+ cl_context_add_ref(ctx);
+
+ CL_OBJECT_LOCK(ctx);
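+ /* The queue list may be temporarily frozen (queue_modify_disable);
+ wait until modifications are allowed again. */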
+ while (ctx->queue_modify_disable) {
+ CL_OBJECT_WAIT_ON_COND(ctx);
+ }
+ list_add_tail(&ctx->queues, &queue->base.node);
+ ctx->queue_num++;
+ CL_OBJECT_UNLOCK(ctx);
+
+ queue->ctx = ctx;
+}
+
+LOCAL void
+cl_context_remove_queue(cl_context ctx, cl_command_queue queue) {
+ assert(queue->ctx == ctx);
+
+ CL_OBJECT_LOCK(ctx);
+ while (ctx->queue_modify_disable) {
+ CL_OBJECT_WAIT_ON_COND(ctx);
+ }
+ list_node_del(&queue->base.node);
+ ctx->queue_num--;
+ CL_OBJECT_UNLOCK(ctx);
+
+ cl_context_delete(ctx);
+ queue->ctx = NULL;
+}
+
+LOCAL void
+cl_context_add_mem(cl_context ctx, cl_mem mem) {
+ assert(mem->ctx == NULL);
+ cl_context_add_ref(ctx);
+
+ CL_OBJECT_LOCK(ctx);
+ list_add_tail(&ctx->mem_objects, &mem->base.node);
+ ctx->mem_object_num++;
+ CL_OBJECT_UNLOCK(ctx);
+
+ mem->ctx = ctx;
+}
+
+LOCAL void
+cl_context_remove_mem(cl_context ctx, cl_mem mem) {
+ assert(mem->ctx == ctx);
+ CL_OBJECT_LOCK(ctx);
+ list_node_del(&mem->base.node);
+ ctx->mem_object_num--;
+ CL_OBJECT_UNLOCK(ctx);
+
+ cl_context_delete(ctx);
+ mem->ctx = NULL;
+}
+
+LOCAL void
+cl_context_add_sampler(cl_context ctx, cl_sampler sampler) {
+ assert(sampler->ctx == NULL);
+ cl_context_add_ref(ctx);
+
+ CL_OBJECT_LOCK(ctx);
+ list_add_tail(&ctx->samplers, &sampler->base.node);
+ ctx->sampler_num++;
+ CL_OBJECT_UNLOCK(ctx);
+
+ sampler->ctx = ctx;
+}
+
+LOCAL void
+cl_context_remove_sampler(cl_context ctx, cl_sampler sampler) {
+ assert(sampler->ctx == ctx);
+ CL_OBJECT_LOCK(ctx);
+ list_node_del(&sampler->base.node);
+ ctx->sampler_num--;
+ CL_OBJECT_UNLOCK(ctx);
+
+ cl_context_delete(ctx);
+ sampler->ctx = NULL;
+}
+
+LOCAL void
+cl_context_add_event(cl_context ctx, cl_event event) {
+ assert(event->ctx == NULL);
+ cl_context_add_ref(ctx);
+
+ CL_OBJECT_LOCK(ctx);
+ list_add_tail(&ctx->events, &event->base.node);
+ ctx->event_num++;
+ CL_OBJECT_UNLOCK(ctx);
+
+ event->ctx = ctx;
+}
+
+LOCAL void
+cl_context_remove_event(cl_context ctx, cl_event event) {
+ assert(event->ctx == ctx);
+ CL_OBJECT_LOCK(ctx);
+ list_node_del(&event->base.node);
+ ctx->event_num--;
+ CL_OBJECT_UNLOCK(ctx);
+
+ cl_context_delete(ctx);
+ event->ctx = NULL;
+}
+
+LOCAL void
+cl_context_add_program(cl_context ctx, cl_program program) {
+ assert(program->ctx == NULL);
+ cl_context_add_ref(ctx);
+
+ CL_OBJECT_LOCK(ctx);
+ list_add_tail(&ctx->programs, &program->base.node);
+ ctx->program_num++;
+ CL_OBJECT_UNLOCK(ctx);
+
+ program->ctx = ctx;
+}
+
+LOCAL void
+cl_context_remove_program(cl_context ctx, cl_program program) {
+ assert(program->ctx == ctx);
+ CL_OBJECT_LOCK(ctx);
+ list_node_del(&program->base.node);
+ ctx->program_num--;
+ CL_OBJECT_UNLOCK(ctx);
+
+ cl_context_delete(ctx);
+ program->ctx = NULL;
+}
+
+
#define CHECK(var) \
if (var) \
return CL_INVALID_PROPERTY; \
@@ -125,6 +260,10 @@ cl_create_context(const cl_context_properties * properties,
cl_context ctx = NULL;
cl_int err = CL_SUCCESS;
cl_uint prop_len = 0;
+ cl_uint dev_num = 0;
+ cl_device_id* all_dev = NULL;
+ cl_uint i, j;
+
/* XXX */
FATAL_IF (num_devices != 1, "Only one device is supported");
@@ -132,8 +271,32 @@ cl_create_context(const cl_context_properties * properties,
if (UNLIKELY(((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS)))
goto error;
+ /* Filter out repeated devices. */
+ assert(num_devices > 0);
+ all_dev = cl_calloc(num_devices, sizeof(cl_device_id));
+ if (all_dev == NULL) {
+ if (errcode_ret)
+ *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+ return NULL;
+ }
+ for (i = 0; i < num_devices; i++) {
+ for (j = 0; j < i; j++) {
+ if (devices[j] == devices[i]) {
+ break;
+ }
+ }
+
+ if (j != i) { // Found a duplicate; skip it.
+ continue;
+ }
+
+ all_dev[dev_num] = devices[i];
+ dev_num++;
+ }
+ assert(dev_num == 1); // TODO: support multiple devices later.
+
/* We are good */
- if (UNLIKELY((ctx = cl_context_new(&props)) == NULL)) {
+ if (UNLIKELY((ctx = cl_context_new(&props, dev_num, all_dev)) == NULL)) {
+ cl_free(all_dev);
err = CL_OUT_OF_HOST_MEMORY;
goto error;
}
@@ -143,13 +306,13 @@ cl_create_context(const cl_context_properties * properties,
memcpy(ctx->prop_user, properties, sizeof(cl_context_properties)*prop_len);
}
ctx->prop_len = prop_len;
- /* Attach the device to the context */
- ctx->device = *devices;
+ /* cl_context_new takes ownership of all_dev. */
+ all_dev = NULL;
/* Save the user callback and user data*/
ctx->pfn_notify = pfn_notify;
ctx->user_data = user_data;
- cl_driver_set_atomic_flag(ctx->drv, ctx->device->atomic_test_result);
+ cl_driver_set_atomic_flag(ctx->drv, ctx->devices[0]->atomic_test_result);
exit:
if (errcode_ret != NULL)
@@ -162,22 +325,23 @@ error:
}
LOCAL cl_context
-cl_context_new(struct _cl_context_prop *props)
+cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id* all_dev)
{
cl_context ctx = NULL;
TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context));
+ CL_OBJECT_INIT_BASE(ctx, CL_OBJECT_CONTEXT_MAGIC);
+ ctx->devices = all_dev;
+ ctx->device_num = dev_num;
+ list_init(&ctx->queues);
+ list_init(&ctx->mem_objects);
+ list_init(&ctx->samplers);
+ list_init(&ctx->events);
+ list_init(&ctx->programs);
+ ctx->queue_modify_disable = CL_FALSE;
TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
- SET_ICD(ctx->dispatch)
ctx->props = *props;
- ctx->magic = CL_MAGIC_CONTEXT_HEADER;
- ctx->ref_n = 1;
ctx->ver = cl_driver_get_ver(ctx->drv);
- pthread_mutex_init(&ctx->program_lock, NULL);
- pthread_mutex_init(&ctx->queue_lock, NULL);
- pthread_mutex_init(&ctx->buffer_lock, NULL);
- pthread_mutex_init(&ctx->sampler_lock, NULL);
- pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
exit:
return ctx;
@@ -195,7 +359,7 @@ cl_context_delete(cl_context ctx)
return;
/* We are not done yet */
- if (atomic_dec(&ctx->ref_n) > 1)
+ if (CL_OBJECT_DEC_REF(ctx) > 1)
return;
/* delete the internal programs. */
@@ -218,16 +382,9 @@ cl_context_delete(cl_context ctx)
cl_program_delete(ctx->built_in_prgs);
ctx->built_in_prgs = NULL;
- /* All object lists should have been freed. Otherwise, the reference counter
- * of the context cannot be 0
- */
- assert(ctx->queues == NULL);
- assert(ctx->programs == NULL);
- assert(ctx->buffers == NULL);
- assert(ctx->drv);
cl_free(ctx->prop_user);
cl_driver_delete(ctx->drv);
- ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ CL_OBJECT_DESTROY_BASE(ctx);
cl_free(ctx);
}
@@ -235,32 +392,7 @@ LOCAL void
cl_context_add_ref(cl_context ctx)
{
assert(ctx);
- atomic_inc(&ctx->ref_n);
-}
-
-LOCAL cl_command_queue
-cl_context_create_queue(cl_context ctx,
- cl_device_id device,
- cl_command_queue_properties properties, /* XXX */
- cl_int *errcode_ret)
-{
- cl_command_queue queue = NULL;
- cl_int err = CL_SUCCESS;
-
-
-
- /* We create the command queue and store it in the context list of queues */
- TRY_ALLOC (queue, cl_command_queue_new(ctx));
- queue->props = properties;
-
-exit:
- if (errcode_ret)
- *errcode_ret = err;
- return queue;
-error:
- cl_command_queue_delete(queue);
- queue = NULL;
- goto exit;
+ CL_OBJECT_INC_REF(ctx);
}
cl_buffer_mgr
@@ -276,9 +408,10 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
cl_int ret;
cl_int binary_status = CL_SUCCESS;
cl_kernel ker;
- pthread_mutex_lock(&ctx->program_lock);
+
+ CL_OBJECT_TAKE_OWNERSHIP(ctx, 1);
if (ctx->internal_prgs[index] == NULL) {
- ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+ ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->devices[0],
&size, (const unsigned char **)&str_kernel, &binary_status, &ret);
if (!ctx->internal_prgs[index]) {
@@ -326,6 +459,41 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
ker = ctx->internal_kernels[index];
unlock:
- pthread_mutex_unlock(&ctx->program_lock);
+ CL_OBJECT_RELEASE_OWNERSHIP(ctx);
return cl_kernel_dup(ker);
}
+
+
+cl_mem
+cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
+{
+ struct list_node *pos;
+ cl_mem buf;
+
+ list_for_each (pos, (&ctx->mem_objects)) {
+ buf = (cl_mem)list_entry(pos, _cl_base_object, node);
+ if(buf->host_ptr == NULL) continue;
+ if(buf->is_svm == 0) continue;
+ if(buf->type != CL_MEM_SVM_TYPE) continue;
+ if((size_t)buf->host_ptr <= (size_t)p &&
+ (size_t)p < ((size_t)buf->host_ptr + buf->size))
+ return buf;
+ }
+ return NULL;
+}
+
+cl_mem
+cl_context_get_mem_from_ptr(cl_context ctx, const void * p)
+{
+ struct list_node *pos;
+ cl_mem buf;
+
+ list_for_each (pos, (&ctx->mem_objects)) {
+ buf = (cl_mem)list_entry(pos, _cl_base_object, node);
+ if(buf->host_ptr == NULL) continue;
+ if((size_t)buf->host_ptr <= (size_t)p &&
+ (size_t)p < ((size_t)buf->host_ptr + buf->size))
+ return buf;
+ }
+ return NULL;
+}
diff --git a/src/cl_context.h b/src/cl_context.h
index 489e5d7..4812afd 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -24,7 +24,7 @@
#include "CL/cl_ext.h"
#include "cl_internals.h"
#include "cl_driver.h"
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
#include <stdint.h>
#include <pthread.h>
@@ -99,23 +99,23 @@ struct _cl_context_prop {
#define EGL_CTX(ctx) (EGLContext)(ctx->props.gl_context)
/* Encapsulate the whole device */
struct _cl_context {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a context */
- volatile int ref_n; /* We reference count this object */
+ _cl_base_object base;
cl_driver drv; /* Handles HW or simulator */
- cl_device_id device; /* All information about the GPU device */
- cl_command_queue queues; /* All command queues currently allocated */
- cl_program programs; /* All programs currently allocated */
- cl_mem buffers; /* All memory object currently allocated */
- cl_sampler samplers; /* All sampler object currently allocated */
+ cl_device_id* devices; /* All devices belonging to this context */
+ cl_uint device_num; /* Number of devices in this context */
+ list_head queues; /* All command queues currently allocated */
+ cl_uint queue_num; /* Number of queues currently allocated */
+ cl_uint queue_modify_disable; /* Temporarily disable queue list changes */
+ list_head mem_objects; /* All memory objects currently allocated */
+ cl_uint mem_object_num; /* Number of memory objects currently allocated */
+ list_head samplers; /* All sampler objects currently allocated */
+ cl_uint sampler_num; /* Number of samplers currently allocated */
+ list_head events; /* All event objects currently allocated */
+ cl_uint event_num; /* Number of events currently allocated */
+ list_head programs; /* All programs currently allocated */
+ cl_uint program_num; /* Number of programs currently allocated */
+
cl_accelerator_intel accels; /* All accelerator_intel object currently allocated */
- cl_event events; /* All event object currently allocated */
- pthread_mutex_t queue_lock; /* To allocate and deallocate queues */
- pthread_mutex_t program_lock; /* To allocate and deallocate programs */
- pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
- pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
- pthread_mutex_t accelerator_intel_lock; /* To allocate and deallocate accelerator_intel */
- pthread_mutex_t event_lock; /* To allocate and deallocate events */
cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
/* All programs internal used, for example clEnqueuexxx api use */
cl_kernel internal_kernels[CL_INTERNAL_KERNEL_MAX];
@@ -132,6 +132,22 @@ struct _cl_context {
};
+#define CL_OBJECT_CONTEXT_MAGIC 0x20BBCADE993134AALL
+#define CL_OBJECT_IS_CONTEXT(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_CONTEXT_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+
+extern void cl_context_add_queue(cl_context ctx, cl_command_queue queue);
+extern void cl_context_remove_queue(cl_context ctx, cl_command_queue queue);
+extern void cl_context_add_mem(cl_context ctx, cl_mem mem);
+extern void cl_context_remove_mem(cl_context ctx, cl_mem mem);
+extern void cl_context_add_sampler(cl_context ctx, cl_sampler sampler);
+extern void cl_context_remove_sampler(cl_context ctx, cl_sampler sampler);
+extern void cl_context_add_event(cl_context ctx, cl_event event);
+extern void cl_context_remove_event(cl_context ctx, cl_event event);
+extern void cl_context_add_program(cl_context ctx, cl_program program);
+extern void cl_context_remove_program(cl_context ctx, cl_program program);
+
/* Implement OpenCL function */
extern cl_context cl_create_context(const cl_context_properties*,
cl_uint,
@@ -141,7 +157,7 @@ extern cl_context cl_create_context(const cl_context_properties*,
cl_int*);
/* Allocate and initialize a context */
-extern cl_context cl_context_new(struct _cl_context_prop *);
+extern cl_context cl_context_new(struct _cl_context_prop *prop, cl_uint dev_num, cl_device_id* all_dev);
/* Destroy and deallocate a context */
extern void cl_context_delete(cl_context);
@@ -149,12 +165,6 @@ extern void cl_context_delete(cl_context);
/* Increment the context reference counter */
extern void cl_context_add_ref(cl_context);
-/* Create the command queue from the given context and device */
-extern cl_command_queue cl_context_create_queue(cl_context,
- cl_device_id,
- cl_command_queue_properties,
- cl_int*);
-
/* Enqueue a ND Range kernel */
extern cl_int cl_context_ND_kernel(cl_context,
cl_command_queue,
@@ -171,5 +181,10 @@ extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
const char * str_kernel, size_t size, const char * str_option);
+/* Get the SVM buffer containing this pointer; return NULL if the pointer is not from SVM */
+extern cl_mem cl_context_get_svm_from_ptr(cl_context ctx, const void *p);
+/* Get the mem object containing this pointer; return NULL if the pointer is not from a mem object */
+extern cl_mem cl_context_get_mem_from_ptr(cl_context ctx, const void *p);
+
#endif /* __CL_CONTEXT_H__ */
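
The per-type add/remove declarations above all follow one pattern: take the context lock, splice the object into the matching list_head, and adjust the counter. A hedged sketch of one of them, assuming the locking macro and list helper names (the real bodies live in cl_context.c and may differ in detail):

    void
    cl_context_add_mem(cl_context ctx, cl_mem mem)
    {
      CL_OBJECT_LOCK(ctx);                        /* assumed locking macro */
      list_add_tail(&ctx->mem_objects, &mem->base.node);
      ctx->mem_object_num++;
      CL_OBJECT_UNLOCK(ctx);
      cl_context_add_ref(ctx);                    /* the object pins its context */
    }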
diff --git a/src/cl_device_enqueue.c b/src/cl_device_enqueue.c
new file mode 100644
index 0000000..b6932df
--- /dev/null
+++ b/src/cl_device_enqueue.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang<rong.r.yang at intel.com>
+ */
+#include "cl_device_enqueue.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_alloc.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+
+LOCAL cl_int
+cl_device_enqueue_fix_offset(cl_kernel ker) {
+ uint32_t i;
+ void *ptr;
+ cl_mem mem;
+ enum gbe_arg_type arg_type; /* kind of argument */
+ for (i = 0; i < ker->arg_n; ++i) {
+ arg_type = interp_kernel_get_arg_type(ker->opaque, i);
+ //TODO: what about image arguments?
+ if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !ker->args[i].mem)
+ continue;
+
+ if(!ker->args[i].is_svm) {
+ mem = ker->args[i].mem;
+ ptr = cl_mem_map(mem, 0);
+ cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
+ cl_buffer_set_bo_use_full_range(mem->bo, 1);
+ cl_buffer_disable_reuse(mem->bo);
+ mem->host_ptr = ptr;
+ cl_mem_unmap(mem);
+ ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ptr;
+ } else {
+ ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ker->args[i].mem->host_ptr;
+ }
+ }
+ return 0;
+}
+
+LOCAL cl_int
+cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti, cl_gpgpu_kernel *kernel)
+{
+ int32_t value = GBE_CURBE_ENQUEUE_BUF_POINTER;
+ int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+ size_t buf_size = 32 * 1024 * 1024; //fixed at 32MB
+ cl_mem mem;
+
+ if(offset > 0) {
+ if(ker->useDeviceEnqueue == false) {
+ if(ker->device_enqueue_ptr == NULL)
+ ker->device_enqueue_ptr = cl_mem_svm_allocate(ker->program->ctx, 0, buf_size, 0);
+ if(ker->device_enqueue_infos == NULL)
+ ker->device_enqueue_infos = cl_calloc(ker->arg_n, sizeof(void *));
+ ker->device_enqueue_info_n = 0;
+ ker->useDeviceEnqueue = CL_TRUE;
+ cl_device_enqueue_fix_offset(ker);
+ cl_kernel_add_ref(ker);
+ }
+
+ mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+ assert(mem);
+ cl_gpgpu_bind_buf(gpgpu, mem->bo, offset, 0, buf_size, *max_bti);
+
+ cl_gpgpu_set_kernel(gpgpu, ker);
+ }
+ return 0;
+}
+
+typedef struct ndrange_info_t {
+ int type;
+ int global_work_size[3];
+ int local_work_size[3];
+ int global_work_offset[3];
+} ndrange_info_t;
+
+typedef struct Block_literal {
+ void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+ int flags;
+ int reserved;
+ int index;
+ struct Block_descriptor_1 {
+ unsigned long int slm_size; // NULL
+ unsigned long int size; // sizeof(struct Block_literal_1)
+ // optional helper functions
+ void *copy_helper; // IFF (1<<25)
+ void *dispose_helper; // IFF (1<<25)
+ // required ABI.2010.3.16
+ const char *signature; // IFF (1<<30)
+ } *descriptor;
+ // imported variables
+} Block_literal;
+
+LOCAL cl_int
+cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
+{
+ cl_mem mem;
+ int size, type, dim, i;
+ const char * kernel_name;
+ cl_kernel child_ker;
+ cl_event evt = NULL;
+
+ cl_kernel ker = cl_gpgpu_get_kernel(gpgpu);
+ if(ker == NULL || ker->useDeviceEnqueue == CL_FALSE)
+ return 0;
+
+ void *buf = cl_gpgpu_ref_batch_buf(gpgpu);
+ //Wait for the gpgpu's batch buffer to finish; the gpgpu in the queue may not
+ //be the same as the gpgpu parameter, for example when flushing an event.
+ cl_gpgpu_sync(buf);
+ cl_gpgpu_unref_batch_buf(buf);
+
+ mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+ if(mem == NULL) return -1;
+ char *ptr = (char *)cl_mem_map(mem, 0);
+
+ size = *(int *)ptr;
+ ptr += 4;
+ while(size > 0) {
+ size_t fixed_global_off[] = {0,0,0};
+ size_t fixed_global_sz[] = {1,1,1};
+ size_t fixed_local_sz[] = {1,1,1};
+ ndrange_info_t* ndrange_info = (ndrange_info_t *)ptr;
+ size -= sizeof(ndrange_info_t);
+ ptr += sizeof(ndrange_info_t);
+
+ Block_literal *block = (Block_literal *)ptr;
+ size -= block->descriptor->size;
+ ptr += block->descriptor->size;
+
+ type = ndrange_info->type;
+ dim = (type & 0xf0) >> 4;
+ type = type & 0xf;
+ assert(dim <= 2);
+ for(i = 0; i <= dim; i++) {
+ fixed_global_sz[i] = ndrange_info->global_work_size[i];
+ if(type > 1)
+ fixed_local_sz[i] = ndrange_info->local_work_size[i];
+ if(type > 2)
+ fixed_global_off[i] = ndrange_info->global_work_offset[i];
+ }
+
+ int *slm_sizes = (int *)ptr;
+ int slm_size = block->descriptor->slm_size;
+ size -= slm_size;
+ ptr += slm_size;
+
+ kernel_name = interp_program_get_device_enqueue_kernel_name(ker->program->opaque, block->index);
+ child_ker = cl_program_create_kernel(ker->program, kernel_name, NULL);
+ assert(child_ker);
+ cl_kernel_set_arg_svm_pointer(child_ker, 0, block);
+ int index = 1;
+ for(i=0; i<slm_size/sizeof(int); i++, index++) {
+ cl_kernel_set_arg(child_ker, index, slm_sizes[i], NULL);
+ }
+ cl_kernel_set_exec_info(child_ker, ker->device_enqueue_info_n * sizeof(void *),
+ ker->device_enqueue_infos);
+
+ if (evt != NULL) {
+ clReleaseEvent(evt);
+ evt = NULL;
+ }
+ clEnqueueNDRangeKernel(queue, child_ker, dim + 1, fixed_global_off,
+ fixed_global_sz, fixed_local_sz, 0, NULL, &evt);
+ cl_command_queue_flush_gpgpu(gpgpu);
+ cl_kernel_delete(child_ker);
+ }
+
+ if (evt != NULL) {
+ //Can't call clWaitForEvents here; it may cause a deadlock.
+ //If evt->exec_data.gpgpu is NULL, evt has already finished.
+ if (evt->exec_data.gpgpu) {
+ buf = cl_gpgpu_ref_batch_buf(evt->exec_data.gpgpu);
+ //Wait for the gpgpu's batch buffer to finish; the gpgpu in the queue may not
+ //be the same as the gpgpu parameter, for example when flushing an event.
+ cl_gpgpu_sync(buf);
+ cl_gpgpu_unref_batch_buf(buf);
+ }
+ clReleaseEvent(evt);
+ evt = NULL;
+ }
+ cl_mem_unmap_auto(mem);
+ cl_kernel_delete(ker);
+ return 0;
+}
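
The ndrange_info_t decode above packs the dimensionality and the argument set into one byte: the high nibble holds dim (the loop copies entries 0..dim) and the low nibble holds type, where values above 1 add a local size and values above 2 add a global offset. A worked decode under those assumptions:

    int packed = 0x23;                /* illustrative value, as if written by the child kernel */
    int dim  = (packed & 0xf0) >> 4;  /* 2 -> entries 0..2, i.e. a 3-D ndrange */
    int type = packed & 0x0f;         /* 3 -> local size and global offset both present */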
diff --git a/backend/src/libocl/src/ocl_sync.cl b/src/cl_device_enqueue.h
similarity index 56%
copy from backend/src/libocl/src/ocl_sync.cl
copy to src/cl_device_enqueue.h
index b6efef8..17fc6c7 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/src/cl_device_enqueue.h
@@ -1,5 +1,5 @@
/*
- * Copyright © 2012 - 2014 Intel Corporation
+ * Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -14,19 +14,18 @@
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
+ * Author: Rong Yang<rong.r.yang at intel.com>
*/
-#include "ocl_sync.h"
-void __gen_ocl_barrier_local(void);
-void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
-void __gen_ocl_debugwait(void);
+#ifndef __CL_DEVICE_ENQUEUE_H__
+#define __CL_DEVICE_ENQUEUE_H__
-OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
-}
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+#include <stdint.h>
-OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags) {
-}
-
-OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags) {
-}
+extern cl_int cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker,
+ uint32_t *max_bti, cl_gpgpu_kernel *kernel);
+extern cl_int cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu);
+#endif /* __CL_DEVICE_ENQUEUE_H__ */
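
For orientation, the two entry points are intended to bracket a launch: bind the 32MB enqueue buffer while setting up kernel resources, then walk its contents once the batch has retired. A hedged call-site sketch (surrounding setup elided, variable names assumed):

    cl_device_enqueue_bind_buffer(gpgpu, ker, &max_bti, kernel); /* during resource binding */
    /* ... submit the batch; after it completes: */
    cl_device_enqueue_parse_result(queue, gpgpu);                /* launch any queued child ndranges */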
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index ded2f1e..31f8616 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -24,7 +24,6 @@
#include "cl_driver.h"
#include "cl_device_data.h"
#include "cl_khr_icd.h"
-#include "cl_thread.h"
#include "CL/cl.h"
#include "CL/cl_ext.h"
#include "CL/cl_intel.h"
@@ -42,7 +41,6 @@
#endif
static struct _cl_device_id intel_ivb_gt2_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 16,
.max_thread_per_unit = 8,
.sub_slice_count = 2,
@@ -53,7 +51,6 @@ static struct _cl_device_id intel_ivb_gt2_device = {
};
static struct _cl_device_id intel_ivb_gt1_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 6,
.max_thread_per_unit = 6,
.sub_slice_count = 1,
@@ -64,7 +61,6 @@ static struct _cl_device_id intel_ivb_gt1_device = {
};
static struct _cl_device_id intel_baytrail_t_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 4,
.max_thread_per_unit = 8,
.sub_slice_count = 1,
@@ -76,7 +72,6 @@ static struct _cl_device_id intel_baytrail_t_device = {
/* XXX we clone IVB for HSW now */
static struct _cl_device_id intel_hsw_gt1_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 10,
.max_thread_per_unit = 7,
.sub_slice_count = 1,
@@ -87,7 +82,6 @@ static struct _cl_device_id intel_hsw_gt1_device = {
};
static struct _cl_device_id intel_hsw_gt2_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 20,
.max_thread_per_unit = 7,
.sub_slice_count = 2,
@@ -98,7 +92,6 @@ static struct _cl_device_id intel_hsw_gt2_device = {
};
static struct _cl_device_id intel_hsw_gt3_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 40,
.max_thread_per_unit = 7,
.sub_slice_count = 4,
@@ -110,7 +103,6 @@ static struct _cl_device_id intel_hsw_gt3_device = {
/* XXX we clone IVB for HSW now */
static struct _cl_device_id intel_brw_gt1_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 12,
.max_thread_per_unit = 7,
.sub_slice_count = 2,
@@ -121,7 +113,6 @@ static struct _cl_device_id intel_brw_gt1_device = {
};
static struct _cl_device_id intel_brw_gt2_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 24,
.max_thread_per_unit = 7,
.sub_slice_count = 3,
@@ -132,7 +123,6 @@ static struct _cl_device_id intel_brw_gt2_device = {
};
static struct _cl_device_id intel_brw_gt3_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 48,
.max_thread_per_unit = 7,
.sub_slice_count = 6,
@@ -144,7 +134,6 @@ static struct _cl_device_id intel_brw_gt3_device = {
//Cherryview has the same PCI ID; we must get max_compute_unit and max_thread_per_unit from drm
static struct _cl_device_id intel_chv_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 8,
.max_thread_per_unit = 7,
.sub_slice_count = 2,
@@ -156,7 +145,6 @@ static struct _cl_device_id intel_chv_device = {
/* XXX we clone brw now */
static struct _cl_device_id intel_skl_gt1_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 6,
.max_thread_per_unit = 7,
.sub_slice_count = 2,
@@ -167,7 +155,6 @@ static struct _cl_device_id intel_skl_gt1_device = {
};
static struct _cl_device_id intel_skl_gt2_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 24,
.max_thread_per_unit = 7,
.sub_slice_count = 3,
@@ -178,7 +165,6 @@ static struct _cl_device_id intel_skl_gt2_device = {
};
static struct _cl_device_id intel_skl_gt3_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 48,
.max_thread_per_unit = 7,
.sub_slice_count = 6,
@@ -189,7 +175,6 @@ static struct _cl_device_id intel_skl_gt3_device = {
};
static struct _cl_device_id intel_skl_gt4_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 72,
.max_thread_per_unit = 7,
.sub_slice_count = 9,
@@ -200,7 +185,6 @@ static struct _cl_device_id intel_skl_gt4_device = {
};
static struct _cl_device_id intel_bxt18eu_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 18,
.max_thread_per_unit = 6,
.sub_slice_count = 3,
@@ -221,7 +205,6 @@ static struct _cl_device_id intel_bxt12eu_device = {
};
static struct _cl_device_id intel_kbl_gt1_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 12,
.max_thread_per_unit = 7,
.sub_slice_count = 2,
@@ -232,7 +215,6 @@ static struct _cl_device_id intel_kbl_gt1_device = {
};
static struct _cl_device_id intel_kbl_gt15_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 18,
.max_thread_per_unit = 7,
.sub_slice_count = 3,
@@ -243,7 +225,6 @@ static struct _cl_device_id intel_kbl_gt15_device = {
};
static struct _cl_device_id intel_kbl_gt2_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 24,
.max_thread_per_unit = 7,
.sub_slice_count = 3,
@@ -254,7 +235,6 @@ static struct _cl_device_id intel_kbl_gt2_device = {
};
static struct _cl_device_id intel_kbl_gt3_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 48,
.max_thread_per_unit = 7,
.sub_slice_count = 6,
@@ -265,7 +245,6 @@ static struct _cl_device_id intel_kbl_gt3_device = {
};
static struct _cl_device_id intel_kbl_gt4_device = {
- INIT_ICD(dispatch)
.max_compute_unit = 72,
.max_thread_per_unit = 7,
.sub_slice_count = 9,
@@ -774,6 +753,7 @@ kbl_gt4_break:
if (ret == NULL)
return NULL;
+ CL_OBJECT_INIT_BASE(ret, CL_OBJECT_DEVICE_MAGIC);
if (!CompilerSupported()) {
ret->compiler_available = CL_FALSE;
//ret->linker_available = CL_FALSE;
@@ -840,7 +820,7 @@ cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
return ret;
cl_driver_set_atomic_flag(ctx->drv, atomic_in_l3_flag);
if (status == CL_SUCCESS) {
- queue = clCreateCommandQueue(ctx, device, 0, &status);
+ queue = clCreateCommandQueueWithProperties(ctx, device, 0, &status);
if (status == CL_SUCCESS) {
program = clCreateProgramWithSource(ctx, 1, &kernel_source, NULL, &status);
if (status == CL_SUCCESS) {
@@ -937,30 +917,6 @@ cl_get_device_ids(cl_platform_id platform,
}
}
-#define DECL_FIELD(CASE,FIELD) \
- case JOIN(CL_DEVICE_,CASE): \
- if (param_value_size_ret) { \
- *param_value_size_ret = sizeof device->FIELD; \
- if (!param_value) \
- return CL_SUCCESS; \
- } \
- if (param_value_size < sizeof device->FIELD) \
- return CL_INVALID_VALUE; \
- memcpy(param_value, &device->FIELD, sizeof device->FIELD); \
- return CL_SUCCESS;
-
-#define DECL_STRING_FIELD(CASE,FIELD) \
- case JOIN(CL_DEVICE_,CASE): \
- if (param_value_size_ret) { \
- *param_value_size_ret = device->JOIN(FIELD,_sz); \
- if (!param_value) \
- return CL_SUCCESS; \
- } \
- if (param_value_size < device->JOIN(FIELD,_sz)) \
- return CL_INVALID_VALUE; \
- memcpy(param_value, device->FIELD, device->JOIN(FIELD,_sz)); \
- return CL_SUCCESS;
-
LOCAL cl_bool is_gen_device(cl_device_id device) {
return device == &intel_ivb_gt1_device ||
device == &intel_ivb_gt2_device ||
@@ -992,101 +948,394 @@ cl_get_device_info(cl_device_id device,
void * param_value,
size_t * param_value_size_ret)
{
+ const void *src_ptr = NULL;
+ size_t src_size = 0;
+ cl_int dev_ref;
+
+ // We currently support only Gen devices.
if (UNLIKELY(is_gen_device(device) == CL_FALSE))
return CL_INVALID_DEVICE;
/* Find the correct parameter */
switch (param_name) {
- DECL_FIELD(TYPE, device_type)
- DECL_FIELD(VENDOR_ID, vendor_id)
- DECL_FIELD(MAX_COMPUTE_UNITS, max_compute_unit)
- DECL_FIELD(MAX_WORK_ITEM_DIMENSIONS, max_work_item_dimensions)
- DECL_FIELD(MAX_WORK_ITEM_SIZES, max_work_item_sizes)
- DECL_FIELD(MAX_WORK_GROUP_SIZE, max_work_group_size)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_CHAR, preferred_vector_width_char)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_SHORT, preferred_vector_width_short)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_INT, preferred_vector_width_int)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_LONG, preferred_vector_width_long)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_FLOAT, preferred_vector_width_float)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_DOUBLE, preferred_vector_width_double)
- DECL_FIELD(PREFERRED_VECTOR_WIDTH_HALF, preferred_vector_width_half)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_CHAR, native_vector_width_char)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_SHORT, native_vector_width_short)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_INT, native_vector_width_int)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_LONG, native_vector_width_long)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_FLOAT, native_vector_width_float)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_DOUBLE, native_vector_width_double)
- DECL_FIELD(NATIVE_VECTOR_WIDTH_HALF, native_vector_width_half)
- DECL_FIELD(MAX_CLOCK_FREQUENCY, max_clock_frequency)
- DECL_FIELD(ADDRESS_BITS, address_bits)
- DECL_FIELD(MAX_MEM_ALLOC_SIZE, max_mem_alloc_size)
- DECL_FIELD(IMAGE_SUPPORT, image_support)
- DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args)
- DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args)
- DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size)
- DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width)
- DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height)
- DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width)
- DECL_FIELD(IMAGE3D_MAX_HEIGHT, image3d_max_height)
- DECL_FIELD(IMAGE3D_MAX_DEPTH, image3d_max_depth)
- DECL_FIELD(MAX_SAMPLERS, max_samplers)
- DECL_FIELD(MAX_PARAMETER_SIZE, max_parameter_size)
- DECL_FIELD(MEM_BASE_ADDR_ALIGN, mem_base_addr_align)
- DECL_FIELD(MIN_DATA_TYPE_ALIGN_SIZE, min_data_type_align_size)
- DECL_FIELD(SINGLE_FP_CONFIG, single_fp_config)
- DECL_FIELD(HALF_FP_CONFIG, half_fp_config)
- DECL_FIELD(DOUBLE_FP_CONFIG, double_fp_config)
- DECL_FIELD(GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type)
- DECL_FIELD(GLOBAL_MEM_CACHELINE_SIZE, global_mem_cache_line_size)
- DECL_FIELD(GLOBAL_MEM_CACHE_SIZE, global_mem_cache_size)
- DECL_FIELD(GLOBAL_MEM_SIZE, global_mem_size)
- DECL_FIELD(MAX_CONSTANT_BUFFER_SIZE, max_constant_buffer_size)
- DECL_FIELD(IMAGE_MAX_BUFFER_SIZE, image_mem_size)
- DECL_FIELD(MAX_CONSTANT_ARGS, max_constant_args)
- DECL_FIELD(LOCAL_MEM_TYPE, local_mem_type)
- DECL_FIELD(LOCAL_MEM_SIZE, local_mem_size)
- DECL_FIELD(ERROR_CORRECTION_SUPPORT, error_correction_support)
- DECL_FIELD(HOST_UNIFIED_MEMORY, host_unified_memory)
- DECL_FIELD(PROFILING_TIMER_RESOLUTION, profiling_timer_resolution)
- DECL_FIELD(ENDIAN_LITTLE, endian_little)
- DECL_FIELD(AVAILABLE, available)
- DECL_FIELD(COMPILER_AVAILABLE, compiler_available)
- DECL_FIELD(LINKER_AVAILABLE, linker_available)
- DECL_FIELD(EXECUTION_CAPABILITIES, execution_capabilities)
- DECL_FIELD(QUEUE_PROPERTIES, queue_properties)
- DECL_FIELD(PLATFORM, platform)
- DECL_FIELD(PRINTF_BUFFER_SIZE, printf_buffer_size)
- DECL_FIELD(PREFERRED_INTEROP_USER_SYNC, interop_user_sync)
- DECL_STRING_FIELD(NAME, name)
- DECL_STRING_FIELD(VENDOR, vendor)
- DECL_STRING_FIELD(VERSION, version)
- DECL_STRING_FIELD(PROFILE, profile)
- DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
- DECL_STRING_FIELD(SPIR_VERSIONS, spir_versions)
- DECL_STRING_FIELD(EXTENSIONS, extensions);
- DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
- DECL_FIELD(PARENT_DEVICE, parent_device)
- DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device)
- DECL_FIELD(PARTITION_PROPERTIES, partition_property)
- DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
- DECL_FIELD(PARTITION_TYPE, partition_type)
- DECL_FIELD(REFERENCE_COUNT, device_reference_count)
- DECL_FIELD(IMAGE_PITCH_ALIGNMENT, image_pitch_alignment)
- DECL_FIELD(IMAGE_BASE_ADDRESS_ALIGNMENT, image_base_address_alignment)
-
- case CL_DRIVER_VERSION:
- if (param_value_size_ret) {
- *param_value_size_ret = device->driver_version_sz;
- if (!param_value)
- return CL_SUCCESS;
+ case CL_DEVICE_TYPE:
+ src_ptr = &device->device_type;
+ src_size = sizeof(device->device_type);
+ break;
+ case CL_DEVICE_VENDOR_ID:
+ src_ptr = &device->vendor_id;
+ src_size = sizeof(device->vendor_id);
+ break;
+ case CL_DEVICE_MAX_COMPUTE_UNITS:
+ src_ptr = &device->max_compute_unit;
+ src_size = sizeof(device->max_compute_unit);
+ break;
+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+ src_ptr = &device->max_work_item_dimensions;
+ src_size = sizeof(device->max_work_item_dimensions);
+ break;
+ case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+ src_ptr = &device->max_work_item_sizes;
+ src_size = sizeof(device->max_work_item_sizes);
+ break;
+ case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+ src_ptr = &device->max_work_group_size;
+ src_size = sizeof(device->max_work_group_size);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+ src_ptr = &device->preferred_vector_width_char;
+ src_size = sizeof(device->preferred_vector_width_char);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+ src_ptr = &device->preferred_vector_width_short;
+ src_size = sizeof(device->preferred_vector_width_short);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+ src_ptr = &device->preferred_vector_width_int;
+ src_size = sizeof(device->preferred_vector_width_int);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+ src_ptr = &device->preferred_vector_width_long;
+ src_size = sizeof(device->preferred_vector_width_long);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+ src_ptr = &device->preferred_vector_width_float;
+ src_size = sizeof(device->preferred_vector_width_float);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+ src_ptr = &device->preferred_vector_width_double;
+ src_size = sizeof(device->preferred_vector_width_double);
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+ src_ptr = &device->preferred_vector_width_half;
+ src_size = sizeof(device->preferred_vector_width_half);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+ src_ptr = &device->native_vector_width_char;
+ src_size = sizeof(device->native_vector_width_char);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+ src_ptr = &device->native_vector_width_short;
+ src_size = sizeof(device->native_vector_width_short);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+ src_ptr = &device->native_vector_width_int;
+ src_size = sizeof(device->native_vector_width_int);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+ src_ptr = &device->native_vector_width_long;
+ src_size = sizeof(device->native_vector_width_long);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+ src_ptr = &device->native_vector_width_float;
+ src_size = sizeof(device->native_vector_width_float);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+ src_ptr = &device->native_vector_width_double;
+ src_size = sizeof(device->native_vector_width_double);
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+ src_ptr = &device->native_vector_width_half;
+ src_size = sizeof(device->native_vector_width_half);
+ break;
+ case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+ src_ptr = &device->max_clock_frequency;
+ src_size = sizeof(device->max_clock_frequency);
+ break;
+ case CL_DEVICE_ADDRESS_BITS:
+ src_ptr = &device->address_bits;
+ src_size = sizeof(device->address_bits);
+ break;
+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+ src_ptr = &device->max_mem_alloc_size;
+ src_size = sizeof(device->max_mem_alloc_size);
+ break;
+ case CL_DEVICE_IMAGE_SUPPORT:
+ src_ptr = &device->image_support;
+ src_size = sizeof(device->image_support);
+ break;
+ case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+ src_ptr = &device->max_read_image_args;
+ src_size = sizeof(device->max_read_image_args);
+ break;
+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+ src_ptr = &device->max_write_image_args;
+ src_size = sizeof(device->max_write_image_args);
+ break;
+ case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS:
+ src_ptr = &device->max_read_write_image_args;
+ src_size = sizeof(device->max_read_write_image_args);
+ break;
+ case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE:
+ src_ptr = &device->image_max_array_size;
+ src_size = sizeof(device->image_max_array_size);
+ break;
+ case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+ src_ptr = &device->image2d_max_width;
+ src_size = sizeof(device->image2d_max_width);
+ break;
+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+ src_ptr = &device->image2d_max_height;
+ src_size = sizeof(device->image2d_max_height);
+ break;
+ case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+ src_ptr = &device->image3d_max_width;
+ src_size = sizeof(device->image3d_max_width);
+ break;
+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+ src_ptr = &device->image3d_max_height;
+ src_size = sizeof(device->image3d_max_height);
+ break;
+ case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+ src_ptr = &device->image3d_max_depth;
+ src_size = sizeof(device->image3d_max_depth);
+ break;
+ case CL_DEVICE_MAX_SAMPLERS:
+ src_ptr = &device->max_samplers;
+ src_size = sizeof(device->max_samplers);
+ break;
+ case CL_DEVICE_MAX_PARAMETER_SIZE:
+ src_ptr = &device->max_parameter_size;
+ src_size = sizeof(device->max_parameter_size);
+ break;
+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+ src_ptr = &device->mem_base_addr_align;
+ src_size = sizeof(device->mem_base_addr_align);
+ break;
+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+ src_ptr = &device->min_data_type_align_size;
+ src_size = sizeof(device->min_data_type_align_size);
+ break;
+ case CL_DEVICE_MAX_PIPE_ARGS:
+ src_ptr = &device->max_pipe_args;
+ src_size = sizeof(device->max_pipe_args);
+ break;
+ case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS:
+ src_ptr = &device->pipe_max_active_reservations;
+ src_size = sizeof(device->pipe_max_active_reservations);
+ break;
+ case CL_DEVICE_PIPE_MAX_PACKET_SIZE:
+ src_ptr = &device->pipe_max_packet_siz;
+ src_size = sizeof(device->pipe_max_packet_siz);
+ break;
+ case CL_DEVICE_SINGLE_FP_CONFIG:
+ src_ptr = &device->single_fp_config;
+ src_size = sizeof(device->single_fp_config);
+ break;
+ case CL_DEVICE_HALF_FP_CONFIG:
+ src_ptr = &device->half_fp_config;
+ src_size = sizeof(device->half_fp_config);
+ break;
+ case CL_DEVICE_DOUBLE_FP_CONFIG:
+ src_ptr = &device->double_fp_config;
+ src_size = sizeof(device->double_fp_config);
+ break;
+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+ src_ptr = &device->global_mem_cache_type;
+ src_size = sizeof(device->global_mem_cache_type);
+ break;
+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+ src_ptr = &device->global_mem_cache_line_size;
+ src_size = sizeof(device->global_mem_cache_line_size);
+ break;
+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+ src_ptr = &device->global_mem_cache_size;
+ src_size = sizeof(device->global_mem_cache_size);
+ break;
+ case CL_DEVICE_GLOBAL_MEM_SIZE:
+ src_ptr = &device->global_mem_size;
+ src_size = sizeof(device->global_mem_size);
+ break;
+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+ src_ptr = &device->max_constant_buffer_size;
+ src_size = sizeof(device->max_constant_buffer_size);
+ break;
+ case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE:
+ src_ptr = &device->image_mem_size;
+ src_size = sizeof(device->image_mem_size);
+ break;
+ case CL_DEVICE_MAX_CONSTANT_ARGS:
+ src_ptr = &device->max_constant_args;
+ src_size = sizeof(device->max_constant_args);
+ break;
+ case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE:
+ src_ptr = &device->max_global_variable_size;
+ src_size = sizeof(device->max_global_variable_size);
+ break;
+ case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE:
+ src_ptr = &device->global_variable_preferred_total_size;
+ src_size = sizeof(device->global_variable_preferred_total_size);
+ break;
+ case CL_DEVICE_LOCAL_MEM_TYPE:
+ src_ptr = &device->local_mem_type;
+ src_size = sizeof(device->local_mem_type);
+ break;
+ case CL_DEVICE_LOCAL_MEM_SIZE:
+ src_ptr = &device->local_mem_size;
+ src_size = sizeof(device->local_mem_size);
+ break;
+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+ src_ptr = &device->error_correction_support;
+ src_size = sizeof(device->error_correction_support);
+ break;
+ case CL_DEVICE_HOST_UNIFIED_MEMORY:
+ src_ptr = &device->host_unified_memory;
+ src_size = sizeof(device->host_unified_memory);
+ break;
+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+ src_ptr = &device->profiling_timer_resolution;
+ src_size = sizeof(device->profiling_timer_resolution);
+ break;
+ case CL_DEVICE_ENDIAN_LITTLE:
+ src_ptr = &device->endian_little;
+ src_size = sizeof(device->endian_little);
+ break;
+ case CL_DEVICE_AVAILABLE:
+ src_ptr = &device->available;
+ src_size = sizeof(device->available);
+ break;
+ case CL_DEVICE_COMPILER_AVAILABLE:
+ src_ptr = &device->compiler_available;
+ src_size = sizeof(device->compiler_available);
+ break;
+ case CL_DEVICE_LINKER_AVAILABLE:
+ src_ptr = &device->linker_available;
+ src_size = sizeof(device->linker_available);
+ break;
+ case CL_DEVICE_EXECUTION_CAPABILITIES:
+ src_ptr = &device->execution_capabilities;
+ src_size = sizeof(device->execution_capabilities);
+ break;
+ case CL_DEVICE_QUEUE_PROPERTIES:
+ src_ptr = &device->queue_properties;
+ src_size = sizeof(device->queue_properties);
+ break;
+ case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
+ src_ptr = &device->queue_on_device_properties;
+ src_size = sizeof(device->queue_on_device_properties);
+ break;
+ case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE:
+ src_ptr = &device->queue_on_device_preferred_size;
+ src_size = sizeof(device->queue_on_device_preferred_size);
+ break;
+ case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE:
+ src_ptr = &device->queue_on_device_max_size;
+ src_size = sizeof(device->queue_on_device_max_size);
+ break;
+ case CL_DEVICE_MAX_ON_DEVICE_QUEUES:
+ src_ptr = &device->max_on_device_queues;
+ src_size = sizeof(device->max_on_device_queues);
+ break;
+ case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
+ src_ptr = &device->max_on_device_events;
+ src_size = sizeof(device->max_on_device_events);
+ break;
+ case CL_DEVICE_PLATFORM:
+ src_ptr = &device->platform;
+ src_size = sizeof(device->platform);
+ break;
+ case CL_DEVICE_PRINTF_BUFFER_SIZE:
+ src_ptr = &device->printf_buffer_size;
+ src_size = sizeof(device->printf_buffer_size);
+ break;
+ case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC:
+ src_ptr = &device->interop_user_sync;
+ src_size = sizeof(device->interop_user_sync);
+ break;
+ case CL_DEVICE_NAME:
+ src_ptr = device->name;
+ src_size = device->name_sz;
+ break;
+ case CL_DEVICE_VENDOR:
+ src_ptr = device->vendor;
+ src_size = device->vendor_sz;
+ break;
+ case CL_DEVICE_VERSION:
+ src_ptr = device->version;
+ src_size = device->version_sz;
+ break;
+ case CL_DEVICE_PROFILE:
+ src_ptr = device->profile;
+ src_size = device->profile_sz;
+ break;
+ case CL_DEVICE_OPENCL_C_VERSION:
+ src_ptr = device->opencl_c_version;
+ src_size = device->opencl_c_version_sz;
+ break;
+ case CL_DEVICE_SPIR_VERSIONS:
+ src_ptr = device->spir_versions;
+ src_size = device->spir_versions_sz;
+ break;
+ case CL_DEVICE_EXTENSIONS:
+ src_ptr = device->extensions;
+ src_size = device->extensions_sz;
+ break;
+ case CL_DEVICE_BUILT_IN_KERNELS:
+ src_ptr = device->built_in_kernels;
+ src_size = device->built_in_kernels_sz;
+ break;
+ case CL_DEVICE_PARENT_DEVICE:
+ src_ptr = &device->parent_device;
+ src_size = sizeof(device->parent_device);
+ break;
+ case CL_DEVICE_PARTITION_MAX_SUB_DEVICES:
+ src_ptr = &device->partition_max_sub_device;
+ src_size = sizeof(device->partition_max_sub_device);
+ break;
+ case CL_DEVICE_PARTITION_PROPERTIES:
+ src_ptr = &device->partition_property;
+ src_size = sizeof(device->partition_property);
+ break;
+ case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
+ src_ptr = &device->affinity_domain;
+ src_size = sizeof(device->affinity_domain);
+ break;
+ case CL_DEVICE_PARTITION_TYPE:
+ src_ptr = &device->partition_type;
+ src_size = sizeof(device->partition_type);
+ break;
+ case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
+ src_ptr = &device->preferred_platform_atomic_alignment;
+ src_size = sizeof(device->preferred_platform_atomic_alignment);
+ break;
+ case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT:
+ src_ptr = &device->preferred_global_atomic_alignment;
+ src_size = sizeof(device->preferred_global_atomic_alignment);
+ break;
+ case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT:
+ src_ptr = &device->preferred_local_atomic_alignment;
+ src_size = sizeof(device->preferred_local_atomic_alignment);
+ break;
+ case CL_DEVICE_IMAGE_PITCH_ALIGNMENT:
+ src_ptr = &device->image_pitch_alignment;
+ src_size = sizeof(device->image_pitch_alignment);
+ break;
+ case CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT:
+ src_ptr = &device->image_base_address_alignment;
+ src_size = sizeof(device->image_base_address_alignment);
+ break;
+ case CL_DEVICE_SVM_CAPABILITIES:
+ src_ptr = &device->svm_capabilities;
+ src_size = sizeof(device->svm_capabilities);
+ break;
+ case CL_DEVICE_REFERENCE_COUNT:
+ {
+ dev_ref = CL_OBJECT_GET_REF(device);
+ src_ptr = &dev_ref;
+ src_size = sizeof(cl_int);
+ break;
}
- if (param_value_size < device->driver_version_sz)
- return CL_INVALID_VALUE;
- memcpy(param_value, device->driver_version, device->driver_version_sz);
- return CL_SUCCESS;
+ case CL_DRIVER_VERSION:
+ src_ptr = device->driver_version;
+ src_size = device->driver_version_sz;
+ break;
- default: return CL_INVALID_VALUE;
- };
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ return cl_get_info_helper(src_ptr, src_size,
+ param_value, param_value_size, param_value_size_ret);
}
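
cl_get_info_helper itself is not part of this hunk; judging from the call above it centralizes the usual clGetDeviceInfo size-query protocol that the deleted DECL_FIELD macros used to expand inline. A minimal sketch under that assumption (needs <string.h>):

    static cl_int
    cl_get_info_helper(const void *src, size_t src_size, void *dst,
                       size_t dst_size, size_t *ret_size)
    {
      if (ret_size)
        *ret_size = src_size;      /* always report the required size */
      if (dst == NULL)
        return CL_SUCCESS;         /* pure size query */
      if (dst_size < src_size)
        return CL_INVALID_VALUE;   /* caller's buffer is too small */
      memcpy(dst, src, src_size);
      return CL_SUCCESS;
    }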
LOCAL cl_int
@@ -1140,7 +1389,7 @@ cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
const char * n = cl_kernel_get_name(kernel);
const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
- if (!strstr(device->built_in_kernels, n)){
+ if (n == NULL || !strstr(device->built_in_kernels, n)){
return 0;
}else if(strstr(builtin_kernels_2d, n)){
return 2;
@@ -1156,22 +1405,22 @@ cl_get_kernel_max_wg_sz(cl_kernel kernel)
{
size_t work_group_size, thread_cnt;
int simd_width = interp_kernel_get_simd_width(kernel->opaque);
- int device_id = kernel->program->ctx->device->device_id;
+ int device_id = kernel->program->ctx->devices[0]->device_id;
if (!interp_kernel_use_slm(kernel->opaque)) {
if (!IS_BAYTRAIL_T(device_id) || simd_width == 16)
work_group_size = simd_width * 64;
else
- work_group_size = kernel->program->ctx->device->max_compute_unit *
- kernel->program->ctx->device->max_thread_per_unit * simd_width;
+ work_group_size = kernel->program->ctx->devices[0]->max_compute_unit *
+ kernel->program->ctx->devices[0]->max_thread_per_unit * simd_width;
} else {
- thread_cnt = kernel->program->ctx->device->max_compute_unit *
- kernel->program->ctx->device->max_thread_per_unit / kernel->program->ctx->device->sub_slice_count;
+ thread_cnt = kernel->program->ctx->devices[0]->max_compute_unit *
+ kernel->program->ctx->devices[0]->max_thread_per_unit / kernel->program->ctx->devices[0]->sub_slice_count;
if(thread_cnt > 64)
thread_cnt = 64;
work_group_size = thread_cnt * simd_width;
}
- if(work_group_size > kernel->program->ctx->device->max_work_group_size)
- work_group_size = kernel->program->ctx->device->max_work_group_size;
+ if(work_group_size > kernel->program->ctx->devices[0]->max_work_group_size)
+ work_group_size = kernel->program->ctx->devices[0]->max_work_group_size;
return work_group_size;
}
@@ -1187,7 +1436,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
int dimension = 0;
CHECK_KERNEL(kernel);
if (device == NULL)
- device = kernel->program->ctx->device;
+ device = kernel->program->ctx->devices[0];
if (UNLIKELY(is_gen_device(device) == CL_FALSE))
return CL_INVALID_DEVICE;
@@ -1259,7 +1508,7 @@ cl_get_kernel_subgroup_info(cl_kernel kernel,
{
int err = CL_SUCCESS;
if(device != NULL)
- if (kernel->program->ctx->device != device)
+ if (kernel->program->ctx->devices[0] != device)
return CL_INVALID_DEVICE;
CHECK_KERNEL(kernel);
@@ -1329,3 +1578,54 @@ cl_get_kernel_subgroup_info(cl_kernel kernel,
error:
return err;
}
+
+LOCAL cl_int
+cl_devices_list_check(cl_uint num_devices, const cl_device_id *devices)
+{
+ cl_uint i;
+
+ if (devices == NULL)
+ return CL_INVALID_DEVICE;
+
+ assert(num_devices > 0);
+ for (i = 0; i < num_devices; i++) {
+ if (!CL_OBJECT_IS_DEVICE(devices[i])) {
+ return CL_INVALID_DEVICE;
+ }
+
+ if (devices[i]->available == CL_FALSE) {
+ return CL_DEVICE_NOT_AVAILABLE;
+ }
+
+ // We currently support only one platform.
+ if (devices[i]->platform != cl_get_platform_default()) {
+ return CL_INVALID_DEVICE;
+ }
+
+ // TODO: we currently support only the Gen device.
+ if (devices[i] != cl_get_gt_device()) {
+ return CL_INVALID_DEVICE;
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_devices_list_include_check(cl_uint num_devices, const cl_device_id *devices,
+ cl_uint num_to_check, const cl_device_id *devices_to_check)
+{
+ cl_uint i, j;
+
+ for (i = 0; i < num_to_check; i++) {
+ for (j = 0; j < num_devices; j++) {
+ if (devices_to_check[i] == devices[j])
+ break;
+ }
+
+ if (j == num_devices)
+ return CL_INVALID_DEVICE;
+ }
+
+ return CL_SUCCESS;
+}
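
A hedged usage sketch of the two checks: validate a caller-supplied list up front, then later verify that a program's devices are a subset of its context's (error handling simplified):

    cl_int err = cl_devices_list_check(num_devices, devices);
    if (err != CL_SUCCESS)
      return err;   /* CL_INVALID_DEVICE or CL_DEVICE_NOT_AVAILABLE */

    err = cl_devices_list_include_check(ctx->device_num, ctx->devices,
                                        num_devices, devices);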
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 7db125b..9d8b512 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -22,10 +22,10 @@
#define EXTENSTION_LENGTH 512
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
/* Store complete information about the device */
struct _cl_device_id {
- DEFINE_ICD(dispatch)
+ _cl_base_object base;
cl_device_type device_type;
cl_uint device_id;
cl_uint vendor_id;
@@ -55,9 +55,14 @@ struct _cl_device_id {
cl_uint max_clock_frequency;
cl_uint address_bits;
cl_ulong max_mem_alloc_size;
+ cl_device_svm_capabilities svm_capabilities;
+ cl_uint preferred_platform_atomic_alignment;
+ cl_uint preferred_global_atomic_alignment;
+ cl_uint preferred_local_atomic_alignment;
cl_bool image_support;
cl_uint max_read_image_args;
cl_uint max_write_image_args;
+ cl_uint max_read_write_image_args;
size_t image2d_max_width;
size_t image_max_array_size;
size_t image2d_max_height;
@@ -69,6 +74,9 @@ struct _cl_device_id {
size_t max_parameter_size;
cl_uint mem_base_addr_align;
cl_uint min_data_type_align_size;
+ cl_uint max_pipe_args;
+ cl_uint pipe_max_active_reservations;
+ cl_uint pipe_max_packet_siz;
cl_device_fp_config single_fp_config;
cl_device_fp_config half_fp_config;
cl_device_fp_config double_fp_config;
@@ -78,6 +86,8 @@ struct _cl_device_id {
cl_ulong global_mem_size;
cl_ulong max_constant_buffer_size;
cl_uint max_constant_args;
+ size_t max_global_variable_size;
+ size_t global_variable_preferred_total_size;
cl_device_local_mem_type local_mem_type;
cl_ulong local_mem_size;
cl_ulong scratch_mem_size;
@@ -90,6 +100,12 @@ struct _cl_device_id {
cl_bool linker_available;
cl_device_exec_capabilities execution_capabilities;
cl_command_queue_properties queue_properties;
+ cl_command_queue_properties queue_on_host_properties;
+ cl_command_queue_properties queue_on_device_properties;
+ cl_uint queue_on_device_preferred_size;
+ cl_uint queue_on_device_max_size;
+ cl_uint max_on_device_queues;
+ cl_uint max_on_device_events;
cl_platform_id platform;
size_t printf_buffer_size;
cl_bool interop_user_sync;
@@ -117,15 +133,19 @@ struct _cl_device_id {
cl_device_partition_property partition_property[3];
cl_device_affinity_domain affinity_domain;
cl_device_partition_property partition_type[3];
- cl_uint device_reference_count;
uint32_t atomic_test_result;
- uint32_t image_pitch_alignment;
- uint32_t image_base_address_alignment;
+ cl_uint image_pitch_alignment;
+ cl_uint image_base_address_alignment;
//inited as NULL, created only when cmrt kernel is used
void* cmrt_device; //realtype: CmDevice*
};
+#define CL_OBJECT_DEVICE_MAGIC 0x2acaddcca8853c52LL
+#define CL_OBJECT_IS_DEVICE(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_DEVICE_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+
/* Get a device from the given platform */
extern cl_int cl_get_device_ids(cl_platform_id platform,
cl_device_type device_type,
@@ -162,5 +182,9 @@ extern cl_int cl_get_kernel_subgroup_info(cl_kernel kernel,
extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
+extern cl_int cl_devices_list_check(cl_uint num_devices, const cl_device_id *devices);
+extern cl_int cl_devices_list_include_check(cl_uint num_devices, const cl_device_id *devices,
+ cl_uint num_to_check, const cl_device_id *devices_to_check);
+
#endif /* __CL_DEVICE_ID_H__ */
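
With DEFINE_ICD gone, object identity now rests on the magic word plus a live reference count. API entry points can then gate on CL_OBJECT_IS_DEVICE with a small guard macro; CHECK_DEVICE below is a hypothetical name following the CHECK_KERNEL pattern already used in cl_device_id.c:

    #define CHECK_DEVICE(dev)           \
      do {                              \
        if (!CL_OBJECT_IS_DEVICE(dev))  \
          return CL_INVALID_DEVICE;     \
      } while (0)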
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 584be9d..07c5f7f 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -123,7 +123,7 @@ typedef enum gpu_command_status {
typedef struct cl_gpgpu_kernel {
const char *name; /* kernel name and bo name */
uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
- uint32_t curbe_sz; /* total size of all curbes */
+ uint32_t curbe_sz; /* total size of all curbes */
cl_buffer bo; /* kernel code in the proper addr space */
int32_t barrierID; /* barrierID for _this_ kernel */
uint32_t use_slm:1; /* For gen7 (automatic barrier management) */
@@ -147,6 +147,12 @@ extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, size_t size, uint8_t bti);
extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
+typedef void (cl_gpgpu_set_kernel_cb)(cl_gpgpu, void *);
+extern cl_gpgpu_set_kernel_cb *cl_gpgpu_set_kernel;
+
+typedef void* (cl_gpgpu_get_kernel_cb)(cl_gpgpu);
+extern cl_gpgpu_get_kernel_cb *cl_gpgpu_get_kernel;
+
/* bind samplers defined in both kernel and kernel args. */
typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
@@ -262,11 +268,11 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
/* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*);
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, int, uint64_t*);
extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
/* Get current GPU time stamp */
-typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_driver, uint64_t*);
extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
/* Get current batch buffer handle */
@@ -326,10 +332,10 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
+ const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3]);
extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
-
/**************************************************************************
* Buffer
**************************************************************************/
@@ -340,8 +346,17 @@ extern cl_buffer_alloc_cb *cl_buffer_alloc;
typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, void *, size_t, unsigned long);
extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr;
+typedef int (cl_buffer_set_softpin_offset_cb)(cl_buffer, uint64_t);
+extern cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset;
+
+typedef int (cl_buffer_set_bo_use_full_range_cb)(cl_buffer, uint32_t);
+extern cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range;
+
+typedef int (cl_buffer_disable_reuse_cb)(cl_buffer);
+extern cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse;
+
/* Set a buffer's tiling mode */
-typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+typedef int (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
#include "cl_context.h"
@@ -351,7 +366,7 @@ typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, in
struct _cl_mem_image *gl_image);
extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
-typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
+typedef void (cl_buffer_release_from_texture_cb)(cl_context, struct _cl_mem_gl_image *);
extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
@@ -436,35 +451,5 @@ extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
typedef void (cl_driver_update_device_info_cb)(cl_device_id device);
extern cl_driver_update_device_info_cb *cl_driver_update_device_info;
-/**************************************************************************
- * cl_khr_gl_sharing.
- **************************************************************************/
-typedef int (cl_gl_acquire_texture_cb)(void *driver, void *ctx, int target,
- int level, int texture, void*user_data);
-extern cl_gl_acquire_texture_cb *cl_gl_acquire_texture;
-
-typedef int (cl_gl_release_texture_cb)(void *driver, void *ctx, int target,
- int level, int texture);
-extern cl_gl_release_texture_cb *cl_gl_release_texture;
-
-typedef int (cl_gl_acquire_buffer_object_cb)(void *driver, void *ctx,
- int bufobj, void* user_data);
-extern cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object;
-
-typedef int (cl_gl_release_buffer_object_cb)(void *driver, void *ctx, int bufobj);
-extern cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object;
-
-typedef int (cl_gl_acquire_render_buffer_cb)(void *driver, void *ctx,
- int rb, void* user_data);
-extern cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer;
-
-typedef int (cl_gl_release_render_buffer_cb)(void *driver, void *ctx, int rb);
-extern cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer;
-
-#ifndef DEFAULT_DRIVER_DIR
-/* this is normally defined in Mesa/configs/default with DRI_DRIVER_SEARCH_PATH */
-#define DEFAULT_DRIVER_DIR "/usr/local/lib/dri"
-#endif
-
#endif /* __CL_DRIVER_H__ */
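
The driver ABI remains a table of function pointers (declared here, defaulted to NULL in cl_driver_defs.c), which a backend fills in at load time. A hedged registration sketch; the intel_* names are placeholders standing in for the real backend functions:

    static void
    intel_setup_callbacks(void)
    {
      cl_buffer_set_tiling = intel_buffer_set_tiling;  /* note: now returns int */
      cl_gpgpu_set_kernel  = intel_gpgpu_set_kernel;
      cl_gpgpu_get_kernel  = intel_gpgpu_get_kernel;
    }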
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index ea4e90a..18ab473 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -33,6 +33,9 @@ LOCAL cl_driver_update_device_info_cb *cl_driver_update_device_info = NULL;
/* Buffer */
LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL;
+LOCAL cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset = NULL;
+LOCAL cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range = NULL;
+LOCAL cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse = NULL;
LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
@@ -57,13 +60,6 @@ LOCAL cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align = NULL;
LOCAL cl_buffer_get_buffer_from_fd_cb *cl_buffer_get_buffer_from_fd = NULL;
LOCAL cl_buffer_get_image_from_fd_cb *cl_buffer_get_image_from_fd = NULL;
-/* cl_khr_gl_sharing */
-LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
-LOCAL cl_gl_release_texture_cb *cl_gl_release_texture = NULL;
-LOCAL cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object = NULL;
-LOCAL cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object = NULL;
-LOCAL cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer = NULL;
-LOCAL cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer = NULL;
/* GPGPU */
LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
@@ -107,4 +103,6 @@ LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
+LOCAL cl_gpgpu_set_kernel_cb *cl_gpgpu_set_kernel = NULL;
+LOCAL cl_gpgpu_get_kernel_cb *cl_gpgpu_get_kernel = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 54c0ffa..8350089 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -16,93 +16,102 @@
*
* Author: Rong Yang <rong.r.yang at intel.com>
*/
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <pthread.h>
+//#include "cl_image.h"
#include "cl_enqueue.h"
-#include "cl_image.h"
#include "cl_driver.h"
#include "cl_event.h"
#include "cl_command_queue.h"
#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_device_enqueue.h"
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
-
-cl_int cl_enqueue_read_buffer(enqueue_data* data)
+static cl_int
+cl_enqueue_read_buffer(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
+
+ if (status != CL_COMPLETE)
+ return err;
+
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
- struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
//cl_buffer_get_subdata is sometimes very slow in the Linux kernel on SKL and CHV,
//and it happens randomly. So temporarily disable it and use map/copy/unmap to read instead.
//It should be re-enabled once the root cause is found.
if (0 && !mem->is_userptr) {
if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
- data->size, data->ptr) != 0)
+ data->size, data->ptr) != 0)
err = CL_MAP_FAILURE;
} else {
- void* src_ptr = cl_mem_map_auto(mem, 0);
+ void *src_ptr = cl_mem_map_auto(mem, 0);
if (src_ptr == NULL)
err = CL_MAP_FAILURE;
else {
//Sometimes the application invokes a buffer read instead of a buffer map even when userptr is enabled;
//the memcpy is not necessary in that case.
- if (data->ptr != (char*)src_ptr + data->offset + buffer->sub_offset)
- memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
+ if (data->ptr != (char *)src_ptr + data->offset + buffer->sub_offset)
+ memcpy(data->ptr, (char *)src_ptr + data->offset + buffer->sub_offset, data->size);
cl_mem_unmap_auto(mem);
}
}
return err;
}
-cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
+static cl_int
+cl_enqueue_read_buffer_rect(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
- void* src_ptr;
- void* dst_ptr;
+ void *src_ptr;
+ void *dst_ptr;
- const size_t* origin = data->origin;
- const size_t* host_origin = data->host_origin;
- const size_t* region = data->region;
+ const size_t *origin = data->origin;
+ const size_t *host_origin = data->host_origin;
+ const size_t *region = data->region;
cl_mem mem = data->mem_obj;
+
+ if (status != CL_COMPLETE)
+ return err;
+
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
- struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
err = CL_MAP_FAILURE;
goto error;
}
- size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
- src_ptr = (char*)src_ptr + offset + buffer->sub_offset;
-
- offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
- dst_ptr = (char *)data->ptr + offset;
-
- if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
- (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
- {
- memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
- }
- else {
- cl_uint y, z;
- for (z = 0; z < region[2]; z++) {
- const char* src = src_ptr;
- char* dst = dst_ptr;
- for (y = 0; y < region[1]; y++) {
- memcpy(dst, src, region[0]);
- src += data->row_pitch;
- dst += data->host_row_pitch;
- }
- src_ptr = (char*)src_ptr + data->slice_pitch;
- dst_ptr = (char*)dst_ptr + data->host_slice_pitch;
- }
- }
+ size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
+ src_ptr = (char *)src_ptr + offset + buffer->sub_offset;
+
+ offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
+ dst_ptr = (char *)data->ptr + offset;
+
+ if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+ (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
+ memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
+ } else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char *src = src_ptr;
+ char *dst = dst_ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, region[0]);
+ src += data->row_pitch;
+ dst += data->host_row_pitch;
+ }
+ src_ptr = (char *)src_ptr + data->slice_pitch;
+ dst_ptr = (char *)dst_ptr + data->host_slice_pitch;
+ }
+ }
err = cl_mem_unmap_auto(mem);
@@ -110,75 +119,80 @@ error:
return err;
}
-cl_int cl_enqueue_write_buffer(enqueue_data *data)
+static cl_int
+cl_enqueue_write_buffer(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
- struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
+
+ if (status != CL_COMPLETE)
+ return err;
if (mem->is_userptr) {
- void* dst_ptr = cl_mem_map_auto(mem, 1);
+ void *dst_ptr = cl_mem_map_auto(mem, 1);
if (dst_ptr == NULL)
err = CL_MAP_FAILURE;
else {
- memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
+ memcpy((char *)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
cl_mem_unmap_auto(mem);
}
- }
- else {
+ } else {
if (cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
- data->size, data->const_ptr) != 0)
+ data->size, data->const_ptr) != 0)
err = CL_MAP_FAILURE;
}
return err;
}
-cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
+static cl_int
+cl_enqueue_write_buffer_rect(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
- void* src_ptr;
- void* dst_ptr;
+ void *src_ptr;
+ void *dst_ptr;
- const size_t* origin = data->origin;
- const size_t* host_origin = data->host_origin;
- const size_t* region = data->region;
+ const size_t *origin = data->origin;
+ const size_t *host_origin = data->host_origin;
+ const size_t *region = data->region;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
- struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
+
+ if (status != CL_COMPLETE)
+ return err;
if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
err = CL_MAP_FAILURE;
goto error;
}
- size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+ size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
- offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
- src_ptr = (char*)data->const_ptr + offset;
+ offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
+ src_ptr = (char *)data->const_ptr + offset;
if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
- (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
- {
- memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
- }
- else {
+ (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
+ memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
+ } else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
- const char* src = src_ptr;
- char* dst = dst_ptr;
+ const char *src = src_ptr;
+ char *dst = dst_ptr;
for (y = 0; y < region[1]; y++) {
memcpy(dst, src, region[0]);
src += data->host_row_pitch;
dst += data->row_pitch;
}
- src_ptr = (char*)src_ptr + data->host_slice_pitch;
- dst_ptr = (char*)dst_ptr + data->slice_pitch;
+ src_ptr = (char *)src_ptr + data->host_slice_pitch;
+ dst_ptr = (char *)dst_ptr + data->slice_pitch;
}
}
@@ -188,16 +202,19 @@ error:
return err;
}
-
-cl_int cl_enqueue_read_image(enqueue_data *data)
+static cl_int
+cl_enqueue_read_image(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
- void* src_ptr;
+ void *src_ptr;
cl_mem mem = data->mem_obj;
CHECK_IMAGE(mem, image);
- const size_t* origin = data->origin;
- const size_t* region = data->region;
+ const size_t *origin = data->origin;
+ const size_t *region = data->region;
+
+ if (status != CL_COMPLETE)
+ return err;
if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
err = CL_MAP_FAILURE;
@@ -208,40 +225,42 @@ cl_int cl_enqueue_read_image(enqueue_data *data)
src_ptr = (char*)src_ptr + offset;
if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
- (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
- {
- memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
- }
- else {
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch))) {
+ memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
+ } else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
- const char* src = src_ptr;
- char* dst = data->ptr;
+ const char *src = src_ptr;
+ char *dst = data->ptr;
for (y = 0; y < region[1]; y++) {
- memcpy(dst, src, image->bpp*region[0]);
+ memcpy(dst, src, image->bpp * region[0]);
src += image->row_pitch;
dst += data->row_pitch;
}
- src_ptr = (char*)src_ptr + image->slice_pitch;
- data->ptr = (char*)data->ptr + data->slice_pitch;
+ src_ptr = (char *)src_ptr + image->slice_pitch;
+ data->ptr = (char *)data->ptr + data->slice_pitch;
}
}
- err = cl_mem_unmap_auto(mem);
+ err = cl_mem_unmap_auto(mem);
error:
return err;
-
}
-cl_int cl_enqueue_write_image(enqueue_data *data)
+static cl_int
+cl_enqueue_write_image(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
- void* dst_ptr;
+ void *dst_ptr;
cl_mem mem = data->mem_obj;
+
CHECK_IMAGE(mem, image);
+ if (status != CL_COMPLETE)
+ return err;
+
if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
err = CL_MAP_FAILURE;
goto error;
@@ -255,45 +274,58 @@ cl_int cl_enqueue_write_image(enqueue_data *data)
error:
return err;
-
}
-cl_int cl_enqueue_map_buffer(enqueue_data *data)
+static cl_int
+cl_enqueue_map_buffer(enqueue_data *data, cl_int status)
{
void *ptr = NULL;
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
- mem->type == CL_MEM_SUBBUFFER_TYPE);
- struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ mem->type == CL_MEM_SUBBUFFER_TYPE ||
+ mem->type == CL_MEM_SVM_TYPE);
+ struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
- if (mem->is_userptr)
- ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
- else {
- if(data->unsync_map == 1)
- //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
- ptr = cl_mem_map_gtt(mem);
- else
+ if (status == CL_SUBMITTED) {
+ if (buffer->base.is_userptr) {
+ ptr = buffer->base.host_ptr;
+ } else {
+ if ((ptr = cl_mem_map_gtt_unsync(&buffer->base)) == NULL) {
+ err = CL_MAP_FAILURE;
+ return err;
+ }
+ }
+ data->ptr = ptr;
+ } else if (status == CL_COMPLETE) {
+ if (mem->is_userptr)
ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
- }
+ else {
+ if (data->unsync_map == 1)
+ //clEnqueueMapBuffer used an unsynchronized map, so force map_gtt here
+ ptr = cl_mem_map_gtt(mem);
+ else
+ ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+ }
- if (ptr == NULL) {
- err = CL_MAP_FAILURE;
- goto error;
- }
- data->ptr = ptr;
+ if (ptr == NULL) {
+ err = CL_MAP_FAILURE;
+ return err;
+ }
+ data->ptr = ptr;
- if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
- assert(mem->host_ptr);
- ptr = (char*)ptr + data->offset + buffer->sub_offset;
- memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
+ if ((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
+ assert(mem->host_ptr);
+ ptr = (char *)ptr + data->offset + buffer->sub_offset;
+ memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
+ }
}
-error:
return err;
}
-cl_int cl_enqueue_map_image(enqueue_data *data)
+static cl_int
+cl_enqueue_map_image(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
@@ -301,46 +333,59 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
size_t row_pitch = 0;
CHECK_IMAGE(mem, image);
- if(data->unsync_map == 1)
- //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
- ptr = cl_mem_map_gtt(mem);
- else
- ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+ if (status == CL_SUBMITTED) {
+ if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ data->ptr = ptr;
+ } else if (status == CL_COMPLETE) {
+ if (data->unsync_map == 1)
+ //clEnqueueMapBuffer used an unsynchronized map, so force map_gtt here
+ ptr = cl_mem_map_gtt(mem);
+ else
+ ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
- if (ptr == NULL) {
- err = CL_MAP_FAILURE;
- goto error;
- }
- data->ptr = (char*)ptr + image->offset;
- if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
- row_pitch = image->slice_pitch;
- else
- row_pitch = image->row_pitch;
-
- if(mem->flags & CL_MEM_USE_HOST_PTR) {
- assert(mem->host_ptr);
- if (!mem->is_userptr)
- //src and dst need add offset in function cl_mem_copy_image_region
- cl_mem_copy_image_region(data->origin, data->region,
- mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
- data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
+ if (ptr == NULL) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ data->ptr = (char *)ptr + image->offset;
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ row_pitch = image->slice_pitch;
+ else
+ row_pitch = image->row_pitch;
+
+ if (mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+ if (!mem->is_userptr)
+ //src and dst need the offset added in cl_mem_copy_image_region
+ cl_mem_copy_image_region(data->origin, data->region,
+ mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+ data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
+ }
}
error:
return err;
}
-cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
+static cl_int
+cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
int i, j;
size_t mapped_size = 0;
size_t origin[3], region[3];
- void * v_ptr = NULL;
- void * mapped_ptr = data->ptr;
+ void *v_ptr = NULL;
+ void *mapped_ptr = data->ptr;
cl_mem memobj = data->mem_obj;
size_t row_pitch = 0;
+ if (status != CL_COMPLETE)
+ return err;
+
assert(memobj->mapped_ptr_sz >= memobj->map_ref);
INVALID_VALUE_IF(!mapped_ptr);
for (i = 0; i < memobj->mapped_ptr_sz; i++) {
@@ -348,7 +393,7 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
memobj->mapped_ptr[i].ptr = NULL;
mapped_size = memobj->mapped_ptr[i].size;
v_ptr = memobj->mapped_ptr[i].v_ptr;
- for(j=0; j<3; j++) {
+ for (j = 0; j < 3; j++) {
region[j] = memobj->mapped_ptr[i].region[j];
origin[j] = memobj->mapped_ptr[i].origin[j];
memobj->mapped_ptr[i].region[j] = 0;
@@ -364,10 +409,11 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
if (memobj->flags & CL_MEM_USE_HOST_PTR) {
- if(memobj->type == CL_MEM_BUFFER_TYPE ||
- memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+ if (memobj->type == CL_MEM_BUFFER_TYPE ||
+ memobj->type == CL_MEM_SUBBUFFER_TYPE ||
+ memobj->type == CL_MEM_SVM_TYPE) {
assert(mapped_ptr >= memobj->host_ptr &&
- mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+ mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
/* Sync the data. */
if (!memobj->is_userptr)
memcpy(v_ptr, mapped_ptr, mapped_size);
@@ -381,8 +427,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
if (!memobj->is_userptr)
//v_ptr have added offset, host_ptr have not added offset.
cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
- memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
- image, CL_FALSE, CL_TRUE);
+ memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+ image, CL_FALSE, CL_TRUE);
}
} else {
assert(v_ptr == mapped_ptr);
@@ -391,24 +437,24 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
cl_mem_unmap_auto(memobj);
/* shrink the mapped slot. */
- if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+ if (memobj->mapped_ptr_sz / 2 > memobj->map_ref) {
int j = 0;
cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+ sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz / 2));
if (!new_ptr) {
/* Just do nothing. */
goto error;
}
- memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+ memset(new_ptr, 0, (memobj->mapped_ptr_sz / 2) * sizeof(cl_mapped_ptr));
for (i = 0; i < memobj->mapped_ptr_sz; i++) {
if (memobj->mapped_ptr[i].ptr) {
new_ptr[j] = memobj->mapped_ptr[i];
j++;
- assert(j < memobj->mapped_ptr_sz/2);
+ assert(j < memobj->mapped_ptr_sz / 2);
}
}
- memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+ memobj->mapped_ptr_sz = memobj->mapped_ptr_sz / 2;
free(memobj->mapped_ptr);
memobj->mapped_ptr = new_ptr;
}
@@ -417,7 +463,8 @@ error:
return err;
}
-cl_int cl_enqueue_native_kernel(enqueue_data *data)
+static cl_int
+cl_enqueue_native_kernel(enqueue_data *data, cl_int status)
{
cl_int err = CL_SUCCESS;
cl_uint num_mem_objects = (cl_uint)data->offset;
@@ -425,65 +472,208 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data)
const void **args_mem_loc = (const void **)data->const_ptr;
cl_uint i;
- for (i=0; i<num_mem_objects; ++i)
- {
- const cl_mem buffer = mem_list[i];
- CHECK_MEM(buffer);
+ if (status != CL_COMPLETE)
+ return err;
+
+ for (i = 0; i < num_mem_objects; ++i) {
+ const cl_mem buffer = mem_list[i];
+ CHECK_MEM(buffer);
- *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
+ *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
}
data->user_func(data->ptr);
- for (i=0; i<num_mem_objects; ++i)
- {
- cl_mem_unmap_auto(mem_list[i]);
+ for (i = 0; i < num_mem_objects; ++i) {
+ cl_mem_unmap_auto(mem_list[i]);
}
- free(data->ptr);
error:
return err;
}
-cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
+cl_int cl_enqueue_svm_free(enqueue_data *data, cl_int status)
+{
+ uint i;
+ void **pointers = data->pointers;
+ uint num_svm_ptrs = data->size;
+ cl_int err = CL_SUCCESS;
+
+ if (status != CL_COMPLETE)
+ return err;
+
+ if (data->free_func) {
+ data->free_func(data->queue, num_svm_ptrs, pointers, data->ptr);
+ } else {
+ for (i = 0; i < num_svm_ptrs; i++)
+ cl_mem_svm_delete(data->queue->ctx, pointers[i]);
+ }
+
+ free(pointers);
+ return CL_SUCCESS;
+}
+
+cl_int cl_enqueue_svm_mem_copy(enqueue_data *data, cl_int status)
+{
+ cl_mem mem;
+ size_t size = data->size;
+ const char *src_ptr = (const char *)data->const_ptr;
+ char *dst_ptr = (char *)data->ptr;
+ cl_int err = CL_SUCCESS;
+ size_t i;
+
+ if (status != CL_COMPLETE)
+ return err;
+
+ if ((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->ptr)) != NULL) {
+ dst_ptr = (char *)cl_mem_map_auto(mem, 1);
+ }
+
+ if ((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->const_ptr)) != NULL) {
+ src_ptr = (const char *)cl_mem_map_auto(mem, 0);
+ }
+
+ for (i = 0; i < size; i++) {
+ dst_ptr[i] = src_ptr[i];
+ }
+
+ return CL_SUCCESS;
+}
+
+cl_int cl_enqueue_svm_mem_fill(enqueue_data *data, cl_int status)
+{
+ cl_mem mem;
+ size_t size = data->size;
+ size_t pattern_size = data->pattern_size;
+ const char *pattern = (const char *)data->const_ptr;
+ char *ptr = (char *)data->ptr;
+ cl_int err = CL_SUCCESS;
+ size_t i, j;
+
+ if (status != CL_COMPLETE)
+ return err;
+
+ if ((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->ptr)) != NULL) {
+ ptr = (char *)cl_mem_map_auto(mem, 1);
+ }
+
+ for (i = 0; i < size;) {
+ for (j = 0; j < pattern_size; j++) {
+ ptr[i++] = pattern[j];
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
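For reference, the fill loop above replays the pattern byte by byte; clEnqueueSVMMemFill requires size to be a multiple of pattern_size, so the inner loop never writes past the region. A standalone sketch of the same logic (illustrative only, not part of this patch):

    #include <string.h>

    /* Fill size bytes at ptr by repeating a pattern_size-byte pattern,
       mirroring the loop in cl_enqueue_svm_mem_fill above. */
    static void svm_fill_sketch(char *ptr, const char *pattern,
                                size_t pattern_size, size_t size)
    {
      size_t i;
      for (i = 0; i < size; i += pattern_size)
        memcpy(ptr + i, pattern, pattern_size);
    }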
+static cl_int
+cl_enqueue_ndrange(enqueue_data *data, cl_int status)
+{
+ cl_int err = CL_SUCCESS;
+
+ if (status == CL_SUBMITTED) {
+ err = cl_command_queue_flush_gpgpu(data->gpgpu);
+ //if it is the last ndrange of a cl enqueue API,
+ //check the device enqueue information.
+ if (data->mid_event_of_enq == 0) {
+ assert(data->queue);
+ cl_device_enqueue_parse_result(data->queue, data->gpgpu);
+ }
+ } else if (status == CL_COMPLETE) {
+ void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
+ cl_gpgpu_sync(batch_buf);
+ cl_gpgpu_unref_batch_buf(batch_buf);
+ }
+
+ return err;
+}
+
+static cl_int
+cl_enqueue_marker_or_barrier(enqueue_data *data, cl_int status)
{
- /* if need profiling, add the submit timestamp here. */
- if (event && event->type != CL_COMMAND_USER
- && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
- cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+ return CL_COMPLETE;
+}
+
+LOCAL void
+cl_enqueue_delete(enqueue_data *data)
+{
+ if (data == NULL)
+ return;
+
+ if (data->type == EnqueueCopyBufferRect ||
+ data->type == EnqueueCopyBuffer ||
+ data->type == EnqueueCopyImage ||
+ data->type == EnqueueCopyBufferToImage ||
+ data->type == EnqueueCopyImageToBuffer ||
+ data->type == EnqueueNDRangeKernel ||
+ data->type == EnqueueFillBuffer ||
+ data->type == EnqueueFillImage) {
+ if (data->gpgpu) {
+ cl_gpgpu_delete(data->gpgpu);
+ data->gpgpu = NULL;
+ }
+ return;
}
- switch(data->type) {
- case EnqueueReadBuffer:
- return cl_enqueue_read_buffer(data);
- case EnqueueReadBufferRect:
- return cl_enqueue_read_buffer_rect(data);
- case EnqueueWriteBuffer:
- return cl_enqueue_write_buffer(data);
- case EnqueueWriteBufferRect:
- return cl_enqueue_write_buffer_rect(data);
- case EnqueueReadImage:
- return cl_enqueue_read_image(data);
- case EnqueueWriteImage:
- return cl_enqueue_write_image(data);
- case EnqueueMapBuffer:
- return cl_enqueue_map_buffer(data);
- case EnqueueMapImage:
- return cl_enqueue_map_image(data);
- case EnqueueUnmapMemObject:
- return cl_enqueue_unmap_mem_object(data);
- case EnqueueCopyBufferRect:
- case EnqueueCopyBuffer:
- case EnqueueCopyImage:
- case EnqueueCopyBufferToImage:
- case EnqueueCopyImageToBuffer:
- case EnqueueNDRangeKernel:
- case EnqueueFillBuffer:
- case EnqueueFillImage:
- return cl_event_flush(event);
- case EnqueueNativeKernel:
- return cl_enqueue_native_kernel(data);
- case EnqueueMigrateMemObj:
- default:
- return CL_SUCCESS;
+ if (data->type == EnqueueNativeKernel) {
+ if (data->mem_list) {
+ cl_free((void *)data->mem_list);
+ data->mem_list = NULL;
+ }
+ if (data->ptr) {
+ cl_free((void *)data->ptr);
+ data->ptr = NULL;
+ }
+ if (data->const_ptr) {
+ cl_free((void *)data->const_ptr);
+ data->const_ptr = NULL;
+ }
+ }
+}
+
+LOCAL cl_int
+cl_enqueue_handle(enqueue_data *data, cl_int status)
+{
+ switch (data->type) {
+ case EnqueueReturnSuccesss:
+ return CL_SUCCESS;
+ case EnqueueReadBuffer:
+ return cl_enqueue_read_buffer(data, status);
+ case EnqueueReadBufferRect:
+ return cl_enqueue_read_buffer_rect(data, status);
+ case EnqueueWriteBuffer:
+ return cl_enqueue_write_buffer(data, status);
+ case EnqueueWriteBufferRect:
+ return cl_enqueue_write_buffer_rect(data, status);
+ case EnqueueReadImage:
+ return cl_enqueue_read_image(data, status);
+ case EnqueueWriteImage:
+ return cl_enqueue_write_image(data, status);
+ case EnqueueMapBuffer:
+ return cl_enqueue_map_buffer(data, status);
+ case EnqueueMapImage:
+ return cl_enqueue_map_image(data, status);
+ case EnqueueUnmapMemObject:
+ return cl_enqueue_unmap_mem_object(data, status);
+ case EnqueueSVMFree:
+ return cl_enqueue_svm_free(data, status);
+ case EnqueueSVMMemCopy:
+ return cl_enqueue_svm_mem_copy(data, status);
+ case EnqueueSVMMemFill:
+ return cl_enqueue_svm_mem_fill(data, status);
+ case EnqueueMarker:
+ case EnqueueBarrier:
+ return cl_enqueue_marker_or_barrier(data, status);
+ case EnqueueCopyBufferRect:
+ case EnqueueCopyBuffer:
+ case EnqueueCopyImage:
+ case EnqueueCopyBufferToImage:
+ case EnqueueCopyImageToBuffer:
+ case EnqueueNDRangeKernel:
+ case EnqueueFillBuffer:
+ case EnqueueFillImage:
+ //return cl_event_flush(event);
+ return cl_enqueue_ndrange(data, status);
+ case EnqueueNativeKernel:
+ return cl_enqueue_native_kernel(data, status);
+ case EnqueueMigrateMemObj:
+ default:
+ return CL_SUCCESS;
}
}
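With this rework every handler takes the current event status as a second argument: the copy/fill/NDRange commands flush the GPGPU batch at CL_SUBMITTED and sync then unref the batch buffer at CL_COMPLETE, while the host-side read/write and unmap handlers simply return until the status reaches CL_COMPLETE (the map handlers additionally take an unsynchronized GTT mapping at CL_SUBMITTED). Any command can therefore be driven down the status ladder with one loop, as cl_event_exec in cl_event.c does below. A hedged sketch of that driving loop (drive_to_complete is illustrative, not part of this patch):

    /* Walk a command down the status ladder:
       CL_SUBMITTED (2) -> CL_RUNNING (1) -> CL_COMPLETE (0). */
    static cl_int drive_to_complete(enqueue_data *data)
    {
      cl_int s, err = CL_SUCCESS;
      for (s = CL_SUBMITTED; s >= CL_COMPLETE && err == CL_SUCCESS; s--)
        err = cl_enqueue_handle(data, s);
      return err;
    }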
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 09305af..50a54fc 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -24,7 +24,8 @@
#include "CL/cl.h"
typedef enum {
- EnqueueReadBuffer = 0,
+ EnqueueReturnSuccesss = 0, /* For some cases there is nothing to do; just return SUCCESS. */
+ EnqueueReadBuffer,
EnqueueReadBufferRect,
EnqueueWriteBuffer,
EnqueueWriteBufferRect,
@@ -45,30 +46,45 @@ typedef enum {
EnqueueFillBuffer,
EnqueueFillImage,
EnqueueMigrateMemObj,
+ EnqueueSVMFree,
+ EnqueueSVMMemCopy,
+ EnqueueSVMMemFill,
EnqueueInvalid
} enqueue_type;
typedef struct _enqueue_data {
- enqueue_type type; /* Command type */
- cl_mem mem_obj; /* Enqueue's cl_mem */
- cl_command_queue queue; /* Command queue */
- size_t offset; /* Mem object's offset */
- size_t size; /* Size */
- size_t origin[3]; /* Origin */
- size_t host_origin[3]; /* Origin */
- size_t region[3]; /* Region */
- size_t row_pitch; /* Row pitch */
- size_t slice_pitch; /* Slice pitch */
- size_t host_row_pitch; /* Host row pitch, used in read/write buffer rect */
- size_t host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
- const void * const_ptr; /* Const ptr for memory read */
- void * ptr; /* Ptr for write and return value */
- const cl_mem* mem_list; /* mem_list of clEnqueueNativeKernel */
- uint8_t unsync_map; /* Indicate the clEnqueueMapBuffer/Image is unsync map */
- uint8_t write_map; /* Indicate if the clEnqueueMapBuffer is write enable */
- void (*user_func)(void *); /* pointer to a host-callable user function */
+ enqueue_type type; /* Command type */
+ cl_mem mem_obj; /* Enqueue's cl_mem */
+ cl_command_queue queue; /* Command queue */
+ size_t offset; /* Mem object's offset */
+ size_t size; /* Size */
+ size_t origin[3]; /* Origin */
+ size_t host_origin[3]; /* Origin */
+ size_t region[3]; /* Region */
+ size_t row_pitch; /* Row pitch */
+ size_t slice_pitch; /* Slice pitch */
+ size_t host_row_pitch; /* Host row pitch, used in read/write buffer rect */
+ size_t host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
+ const void *const_ptr; /* Const ptr for memory read */
+ void *ptr; /* Ptr for write and return value */
+ const cl_mem *mem_list; /* mem_list of clEnqueueNativeKernel */
+ uint8_t unsync_map; /* Indicate the clEnqueueMapBuffer/Image is unsync map */
+ uint8_t write_map; /* Indicate if the clEnqueueMapBuffer is write enable */
+ void **pointers; /* The svm_pointers of clEnqueueSVMFree */
+ size_t pattern_size; /* the pattern_size of clEnqueueSVMMemFill */
+ void (*user_func)(void *); /* pointer to a host-callable user function */
+ void (CL_CALLBACK *free_func)( cl_command_queue queue,
+ cl_uint num_svm_pointers,
+ void *svm_pointers[],
+ void *user_data); /* pointer to pfn_free_func of clEnqueueSVMFree */
+ cl_gpgpu gpgpu;
+ cl_bool mid_event_of_enq; /* For a non-uniform ndrange, one enqueue generates a sequence of events;
+ the last event needs to parse the device enqueue information.
+ 0: last event; 1: non-last event */
} enqueue_data;
/* Do real enqueue commands */
-cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
+extern cl_int cl_enqueue_handle(enqueue_data *data, cl_int status);
+extern void cl_enqueue_delete(enqueue_data *data);
+
#endif /* __CL_ENQUEUE_H__ */
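Callers fill an enqueue_data, attach it to an event, and let cl_enqueue_handle act on it as the event status advances. A hedged sketch of the setup for an SVM memory fill (the field choices follow the struct above; queue, svm_ptr, pattern and bytes are assumed to exist and are not part of this patch):

    enqueue_data data = { 0 };
    data.type = EnqueueSVMMemFill;
    data.queue = queue;               /* a valid cl_command_queue */
    data.ptr = svm_ptr;               /* destination SVM allocation */
    data.const_ptr = &pattern;        /* pattern to replicate */
    data.pattern_size = sizeof(pattern);
    data.size = bytes;                /* must be a multiple of pattern_size */
    cl_int err = cl_enqueue_handle(&data, CL_COMPLETE);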
diff --git a/src/cl_event.c b/src/cl_event.c
index a2aacea..3e1dc22 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -14,769 +14,684 @@
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
- * Author: Rong Yang <rong.r.yang at intel.com>
*/
#include "cl_event.h"
#include "cl_context.h"
-#include "cl_utils.h"
-#include "cl_alloc.h"
-#include "cl_khr_icd.h"
-#include "cl_kernel.h"
#include "cl_command_queue.h"
-
-#include <assert.h>
+#include "cl_alloc.h"
+#include <string.h>
#include <stdio.h>
-void cl_event_update_last_events(cl_command_queue queue, int wait)
+// TODO: Need to move this to some device-related file later.
+static void
+cl_event_update_timestamp_gen(cl_event event, cl_int status)
{
- cl_event last_event = get_last_event(queue);
- if(!last_event) return;
- cl_event next, now;
- now = last_event;
- while(now){
- next = now->last_next;//get next first in case set status maintain it
- cl_event_update_status(now,wait);//update event status
- now = next;
+ cl_ulong ts = 0;
+
+ if ((event->exec_data.type == EnqueueCopyBufferRect) ||
+ (event->exec_data.type == EnqueueCopyBuffer) ||
+ (event->exec_data.type == EnqueueCopyImage) ||
+ (event->exec_data.type == EnqueueCopyBufferToImage) ||
+ (event->exec_data.type == EnqueueCopyImageToBuffer) ||
+ (event->exec_data.type == EnqueueNDRangeKernel) ||
+ (event->exec_data.type == EnqueueFillBuffer) ||
+ (event->exec_data.type == EnqueueFillImage)) {
+
+ if (status == CL_QUEUED || status == CL_SUBMITTED) {
+ cl_gpgpu_event_get_gpu_cur_timestamp(event->queue->ctx->drv, &ts);
+
+ if (ts == CL_EVENT_INVALID_TIMESTAMP)
+ ts++;
+ event->timestamp[CL_QUEUED - status] = ts;
+ return;
+ } else if (status == CL_RUNNING) {
+ assert(event->exec_data.gpgpu);
+ return; // Wait for the event to complete; fetch the start and end timestamps then.
+ } else {
+ assert(event->exec_data.gpgpu);
+ cl_gpgpu_event_get_exec_timestamp(event->exec_data.gpgpu, 0, &ts);
+ if (ts == CL_EVENT_INVALID_TIMESTAMP)
+ ts++;
+ event->timestamp[2] = ts;
+ cl_gpgpu_event_get_exec_timestamp(event->exec_data.gpgpu, 1, &ts);
+ if (ts == CL_EVENT_INVALID_TIMESTAMP)
+ ts++;
+ event->timestamp[3] = ts;
+
+ /* Set the submit time to the running time if the submit time is later. */
+ if (event->timestamp[1] > event->timestamp[2] ||
+ event->timestamp[2] - event->timestamp[1] > 0x0FFFFFFFFFF /* Overflowed */)
+ event->timestamp[1] = event->timestamp[2];
+
+ return;
+ }
+ } else {
+ cl_gpgpu_event_get_gpu_cur_timestamp(event->queue->ctx->drv, &ts);
+ if (ts == CL_EVENT_INVALID_TIMESTAMP)
+ ts++;
+ event->timestamp[CL_QUEUED - status] = ts;
+ return;
}
}
-void cl_event_insert_last_events(cl_command_queue queue,cl_event event)
+LOCAL void
+cl_event_update_timestamp(cl_event event, cl_int state)
{
- if(!event) return;
- cl_event last_event = get_last_event(queue);
- if(last_event){
- cl_event now = last_event;
- while(now->last_next)
- now = now->last_next;
- now->last_next = event;
- event->last_prev = now;
+ int i;
+ cl_bool re_cal = CL_FALSE;
+ cl_ulong ts[4];
+
+ assert(state >= CL_COMPLETE && state <= CL_QUEUED);
+
+ if (event->event_type == CL_COMMAND_USER)
+ return;
+
+ assert(event->queue);
+ if ((event->queue->props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ return;
+
+ /* Should not record the timestamp twice. */
+ assert(event->timestamp[CL_QUEUED - state] == CL_EVENT_INVALID_TIMESTAMP);
+ cl_event_update_timestamp_gen(event, state);
+
+ if (state == CL_COMPLETE) {
+ // TODO: Need to set CL_PROFILING_COMMAND_COMPLETE once child enqueue is enabled.
+ // Just a duplicate of event complete time now.
+ event->timestamp[4] = event->timestamp[3];
+
+ /* If a timestamp overflowed, set the queued time to 0 and re-calculate. */
+ for (i = 0; i < 4; i++) {
+ if (event->timestamp[i + 1] < event->timestamp[i]) {
+ re_cal = CL_TRUE;
+ break;
+ }
+ }
+
+ if (re_cal) {
+ for (i = 3; i >= 0; i--) {
+ if (event->timestamp[i + 1] < event->timestamp[i]) { //overflow
+ ts[i] = event->timestamp[i + 1] + (CL_EVENT_INVALID_TIMESTAMP - event->timestamp[i]);
+ } else {
+ ts[i] = event->timestamp[i + 1] - event->timestamp[i];
+ }
+ }
+
+ event->timestamp[0] = 0;
+ for (i = 1; i < 5; i++) {
+ event->timestamp[i] = event->timestamp[i - 1] + ts[i - 1];
+ }
+ }
}
- else set_last_event(queue,event);
}
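The re-calculation above turns possibly wrapped absolute GPU timestamps back into a monotone sequence: each adjacent delta is computed modulo the counter range (with CL_EVENT_INVALID_TIMESTAMP as the wrap value), then the stamps are rebased from zero by a prefix sum. A toy model with an invented 8-bit counter (wrap value 256), illustrative only:

    #include <stdio.h>

    int main(void)
    {
      /* queued=250, submitted=253, start=4 (wrapped), end=10 */
      unsigned long wrap = 256, t[4] = {250, 253, 4, 10}, out[4], d;
      int i;

      out[0] = 0;
      for (i = 0; i < 3; i++) {
        d = (t[i + 1] >= t[i]) ? t[i + 1] - t[i] : t[i + 1] + (wrap - t[i]);
        out[i + 1] = out[i] + d;  /* yields 0, 3, 10, 16: monotone again */
      }
      for (i = 0; i < 4; i++)
        printf("%lu\n", out[i]);
      return 0;
    }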
-static inline cl_bool
-cl_event_is_gpu_command_type(cl_command_type type)
+LOCAL void
+cl_event_add_ref(cl_event event)
{
- switch(type) {
- case CL_COMMAND_COPY_BUFFER:
- case CL_COMMAND_FILL_BUFFER:
- case CL_COMMAND_COPY_IMAGE:
- case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
- case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
- case CL_COMMAND_COPY_BUFFER_RECT:
- case CL_COMMAND_TASK:
- case CL_COMMAND_NDRANGE_KERNEL:
- return CL_TRUE;
- default:
- return CL_FALSE;
- }
+ assert(event);
+ CL_OBJECT_INC_REF(event);
}
-int cl_event_flush(cl_event event)
+LOCAL cl_int
+cl_event_get_status(cl_event event)
{
- int err = CL_SUCCESS;
- if(!event) {
- err = CL_INVALID_VALUE;
- return err;
- }
+ cl_int ret;
- assert(event->gpgpu_event != NULL);
- if (event->gpgpu) {
- err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
- cl_gpgpu_delete(event->gpgpu);
- event->gpgpu = NULL;
- }
- cl_gpgpu_event_flush(event->gpgpu_event);
- cl_event_insert_last_events(event->queue,event);
- return err;
+ assert(event);
+ CL_OBJECT_LOCK(event);
+ ret = event->status;
+ CL_OBJECT_UNLOCK(event);
+ return ret;
}
-cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+static cl_event
+cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type,
+ cl_uint num_events, cl_event *event_list)
{
- cl_event event = NULL;
- GET_QUEUE_THREAD_GPGPU(queue);
+ int i;
+ cl_event e = cl_calloc(1, sizeof(_cl_event));
+ if (e == NULL)
+ return NULL;
- /* Allocate and inialize the structure itself */
- TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
- SET_ICD(event->dispatch)
- event->magic = CL_MAGIC_EVENT_HEADER;
- event->ref_n = 1;
+ CL_OBJECT_INIT_BASE(e, CL_OBJECT_EVENT_MAGIC);
/* Append the event in the context event list */
- pthread_mutex_lock(&ctx->event_lock);
- event->next = ctx->events;
- if (ctx->events != NULL)
- ctx->events->prev = event;
- ctx->events = event;
- pthread_mutex_unlock(&ctx->event_lock);
- event->ctx = ctx;
- cl_context_add_ref(ctx);
-
- /* Initialize all members and create GPGPU event object */
- event->queue = queue;
- event->type = type;
- event->gpgpu_event = NULL;
- if(type == CL_COMMAND_USER) {
- event->status = CL_SUBMITTED;
+ cl_context_add_event(ctx, e);
+ e->queue = queue;
+
+ list_init(&e->callbacks);
+ list_node_init(&e->enqueue_node);
+
+ assert(type >= CL_COMMAND_NDRANGE_KERNEL && type <= CL_COMMAND_SVM_UNMAP);
+ e->event_type = type;
+ if (type == CL_COMMAND_USER) {
+ e->status = CL_SUBMITTED;
+ } else {
+ e->status = CL_EVENT_STATE_UNKNOWN;
}
- else {
- event->status = CL_QUEUED;
- if(cl_event_is_gpu_command_type(event->type))
- event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
+
+ if (type == CL_COMMAND_USER) {
+ assert(queue == NULL);
+ }
+
+ e->depend_events = event_list;
+ e->depend_event_num = num_events;
+ for (i = 0; i < 4; i++) {
+ e->timestamp[i] = CL_EVENT_INVALID_TIMESTAMP;
}
- cl_event_add_ref(event); //dec when complete
- event->user_cb = NULL;
- event->enqueue_cb = NULL;
- event->waits_head = NULL;
- event->emplict = emplict;
-
-exit:
- return event;
-error:
- cl_event_delete(event);
- event = NULL;
- goto exit;
+
+ return e;
}
-void cl_event_delete(cl_event event)
+LOCAL void
+cl_event_delete(cl_event event)
{
+ int i;
+ cl_event_user_callback cb;
+
if (UNLIKELY(event == NULL))
return;
- cl_event_update_status(event, 0);
-
- if (atomic_dec(&event->ref_n) > 1)
+ if (CL_OBJECT_DEC_REF(event) > 1)
return;
- /* Call all user's callback if haven't execute */
- cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
+ cl_enqueue_delete(&event->exec_data);
- /* delete gpgpu event object */
- if(event->gpgpu_event)
- cl_gpgpu_event_delete(event->gpgpu_event);
+ assert(list_node_out_of_list(&event->enqueue_node));
+
+ if (event->depend_events) {
+ assert(event->depend_event_num);
+ for (i = 0; i < event->depend_event_num; i++) {
+ cl_event_delete(event->depend_events[i]);
+ }
+ cl_free(event->depend_events);
+ }
+
+ /* Free all the callbacks. Last ref, no need to lock. */
+ while (!list_empty(&event->callbacks)) {
+ cb = list_entry(event->callbacks.head_node.n, _cl_event_user_callback, node);
+ list_node_del(&cb->node);
+ cl_free(cb);
+ }
/* Remove it from the list */
assert(event->ctx);
- pthread_mutex_lock(&event->ctx->event_lock);
-
- if (event->prev)
- event->prev->next = event->next;
- if (event->next)
- event->next->prev = event->prev;
- /* if this is the head, update head pointer ctx->events */
- if (event->ctx->events == event)
- event->ctx->events = event->next;
-
- pthread_mutex_unlock(&event->ctx->event_lock);
- cl_context_delete(event->ctx);
-
- if (event->gpgpu) {
- fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
- cl_gpgpu_delete(event->gpgpu);
- event->gpgpu = NULL;
- }
+ cl_context_remove_event(event->ctx, event);
+
+ CL_OBJECT_DESTROY_BASE(event);
cl_free(event);
}
-void cl_event_add_ref(cl_event event)
+LOCAL cl_event
+cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
+ const cl_event *event_list, cl_command_type type, cl_int *errcode_ret)
{
- assert(event);
- atomic_inc(&event->ref_n);
-}
+ cl_event e = NULL;
+ cl_event *depend_events = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_uint total_events = 0;
+ int i;
-cl_int cl_event_set_callback(cl_event event ,
- cl_int command_exec_callback_type,
- EVENT_NOTIFY pfn_notify,
- void* user_data)
-{
- assert(event);
- assert(pfn_notify);
+ assert(ctx);
- cl_int err = CL_SUCCESS;
- user_callback *cb;
- TRY_ALLOC(cb, CALLOC(user_callback));
-
- cb->pfn_notify = pfn_notify;
- cb->user_data = user_data;
- cb->status = command_exec_callback_type;
- cb->executed = CL_FALSE;
-
-
- // It is possible that the event enqueued is already completed.
- // clEnqueueReadBuffer can be synchronous and when the callback
- // is registered after, it still needs to get executed.
- pthread_mutex_lock(&event->ctx->event_lock); // Thread safety required: operations on the event->status can be made from many different threads
- if(event->status <= command_exec_callback_type) {
- /* Call user callback */
- pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify can call clFunctions that use the event_lock and from here it's not required
- cb->pfn_notify(event, event->status, cb->user_data);
- cl_free(cb);
- } else {
- // Enqueue to callback list
- cb->next = event->user_cb;
- event->user_cb = cb;
- pthread_mutex_unlock(&event->ctx->event_lock);
- }
+ do {
+ if (event_list)
+ assert(num_events);
-exit:
- return err;
-error:
- err = CL_OUT_OF_HOST_MEMORY;
- cl_free(cb);
- goto exit;
-};
-
-cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event,cl_context ctx)
-{
- cl_int err = CL_SUCCESS;
- cl_int i;
- /* check the event_wait_list and num_events_in_wait_list */
- if((event_wait_list == NULL) &&
- (num_events_in_wait_list > 0))
- goto error;
-
- if ((event_wait_list != NULL) &&
- (num_events_in_wait_list == 0)){
- goto error;
- }
+ if (queue == NULL) {
+ assert(type == CL_COMMAND_USER);
+ assert(event_list == NULL);
+ assert(num_events == 0);
- /* check the event and context */
- for(i=0; i<num_events_in_wait_list; i++) {
- CHECK_EVENT(event_wait_list[i]);
- if(event_wait_list[i]->status < CL_COMPLETE) {
- err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
- goto exit;
+ e = cl_event_new(ctx, queue, type, 0, NULL);
+ if (e == NULL) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ break;
+ }
+ } else {
+ CL_OBJECT_LOCK(queue);
+ total_events = queue->barrier_events_num + num_events;
+
+ if (total_events) {
+ depend_events = cl_calloc(total_events, sizeof(cl_event));
+ if (depend_events == NULL) {
+ CL_OBJECT_UNLOCK(queue);
+ err = CL_OUT_OF_HOST_MEMORY;
+ break;
+ }
+ }
+
+ /* Add all the barrier events as depend events. */
+ for (i = 0; i < queue->barrier_events_num; i++) {
+ assert(CL_EVENT_IS_BARRIER(queue->barrier_events[i]));
+ cl_event_add_ref(queue->barrier_events[i]);
+ depend_events[num_events + i] = queue->barrier_events[i];
+ }
+
+ CL_OBJECT_UNLOCK(queue);
+
+ for (i = 0; i < num_events; i++) {
+ assert(event_list && event_list[i]);
+ assert(event_list[i]->ctx == ctx);
+ assert(CL_OBJECT_IS_EVENT(event_list[i]));
+ cl_event_add_ref(event_list[i]);
+ depend_events[i] = event_list[i];
+ }
+
+ if (depend_events)
+ assert(total_events);
+
+ e = cl_event_new(ctx, queue, type, total_events, depend_events);
+ if (e == NULL) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ break;
+ }
+ depend_events = NULL;
}
- if(event && event == &event_wait_list[i])
- goto error;
- if(event_wait_list[i]->ctx != ctx) {
- err = CL_INVALID_CONTEXT;
- goto exit;
+ } while (0);
+
+ if (err != CL_SUCCESS) {
+ if (depend_events) {
+ for (i = 0; i < total_events; i++) {
+ cl_event_delete(depend_events[i]);
+ }
+ cl_free(depend_events);
}
+
+ // e is still NULL on every error path here, so it never took ownership of depend_events.
+ assert(e == NULL);
+ cl_event_delete(e);
}
-exit:
- return err;
-error:
- err = CL_INVALID_EVENT_WAIT_LIST; //reset error
- goto exit;
+ if (errcode_ret)
+ *errcode_ret = err;
+
+ return e;
}
-cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
- cl_command_queue queue)
+LOCAL cl_int
+cl_event_set_callback(cl_event event, cl_int exec_type, cl_event_notify_cb pfn_notify, void *user_data)
{
- cl_int i;
+ cl_int err = CL_SUCCESS;
+ cl_event_user_callback cb;
+ cl_bool exec_imm = CL_FALSE;
+
+ assert(event);
+ assert(pfn_notify);
- /* Check whether wait user events */
- for(i=0; i<num_events_in_wait_list; i++) {
- if(event_wait_list[i]->status <= CL_COMPLETE)
- continue;
+ do {
+ cb = cl_calloc(1, sizeof(_cl_event_user_callback));
+ if (cb == NULL) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ break;
+ }
- /* Need wait on user event, return and do enqueue defer */
- if((event_wait_list[i]->type == CL_COMMAND_USER) ||
- (event_wait_list[i]->enqueue_cb &&
- (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
- return CL_ENQUEUE_EXECUTE_DEFER;
+ list_node_init(&cb->node);
+ cb->pfn_notify = pfn_notify;
+ cb->user_data = user_data;
+ cb->status = exec_type;
+ cb->executed = CL_FALSE;
+
+ CL_OBJECT_LOCK(event);
+ if (event->status > exec_type) {
+ list_add_tail(&event->callbacks, &cb->node);
+ cb = NULL;
+ } else {
+ /* The status has already reached this point; call it immediately. */
+ exec_imm = CL_TRUE;
}
- }
+ CL_OBJECT_UNLOCK(event);
- if(queue && queue->barrier_events_num )
- return CL_ENQUEUE_EXECUTE_DEFER;
+ if (exec_imm) {
+ cb->pfn_notify(event, event->status, cb->user_data);
+ }
- /* Non user events or all user event finished, wait all enqueue events finish */
- for(i=0; i<num_events_in_wait_list; i++) {
- if(event_wait_list[i]->status <= CL_COMPLETE)
- continue;
+ } while (0);
- //enqueue callback haven't finish, in another thread, wait
- if(event_wait_list[i]->enqueue_cb != NULL)
- return CL_ENQUEUE_EXECUTE_DEFER;
- if(event_wait_list[i]->gpgpu_event)
- cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
- cl_event_set_status(event_wait_list[i], CL_COMPLETE); //Execute user's callback
- }
- return CL_ENQUEUE_EXECUTE_IMM;
+ if (cb)
+ cl_free(cb);
+
+ return err;
}
-void cl_event_new_enqueue_callback(cl_event event,
- enqueue_data *data,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list)
+LOCAL cl_int
+cl_event_set_status(cl_event event, cl_int status)
{
- enqueue_callback *cb, *node;
- user_event *user_events, *u_ev;
- cl_command_queue queue = event ? event->queue : NULL;
- cl_int i;
- cl_int err = CL_SUCCESS;
+ list_head tmp_callbacks;
+ list_node *n;
+ list_node *pos;
+ cl_bool notify_queue = CL_FALSE;
+ cl_event_user_callback cb;
- /* Allocate and initialize the structure itself */
- TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
- cb->num_events = 0;
- TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
- for(i=0; i<num_events_in_wait_list; i++) {
- //user event will insert to cb->wait_user_events, need not in wait list, avoid ref twice
- if(event_wait_list[i]->type != CL_COMMAND_USER) {
- cb->wait_list[cb->num_events++] = event_wait_list[i];
- cl_event_add_ref(event_wait_list[i]); //add defer enqueue's wait event reference
- }
+ assert(event);
+
+ CL_OBJECT_LOCK(event);
+ if (event->status <= CL_COMPLETE) { // Already set to error or completed
+ CL_OBJECT_UNLOCK(event);
+ return CL_INVALID_OPERATION;
}
- cb->event = event;
- cb->next = NULL;
- cb->wait_user_events = NULL;
-
- if(queue && queue->barrier_events_num > 0) {
- for(i=0; i<queue->barrier_events_num; i++) {
- /* Insert the enqueue_callback to user event list */
- node = queue->wait_events[i]->waits_head;
- if(node == NULL)
- queue->wait_events[i]->waits_head = cb;
- else{
- while((node != cb) && node->next)
- node = node->next;
- if(node == cb) //wait on dup user event
- continue;
- node->next = cb;
- }
- /* Insert the user event to enqueue_callback's wait_user_events */
- TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]);
- cl_event_add_ref(queue->wait_events[i]);
- }
+ if (CL_EVENT_IS_USER(event)) {
+ assert(event->status != CL_RUNNING && event->status != CL_QUEUED);
+ } else {
+ assert(event->queue); // Must belong to some queue.
}
- /* Find out all user events that in event_wait_list wait */
- for(i=0; i<num_events_in_wait_list; i++) {
- if(event_wait_list[i]->status <= CL_COMPLETE)
- continue;
-
- if(event_wait_list[i]->type == CL_COMMAND_USER) {
- /* Insert the enqueue_callback to user event list */
- node = event_wait_list[i]->waits_head;
- if(node == NULL)
- event_wait_list[i]->waits_head = cb;
- else {
- while((node != cb) && node->next)
- node = node->next;
- if(node == cb) //wait on dup user event
+ if (status >= event->status) { // Should never go back.
+ CL_OBJECT_UNLOCK(event);
+ return CL_INVALID_OPERATION;
+ }
+
+ event->status = status;
+
+ /* Call all the callbacks. */
+ if (!list_empty(&event->callbacks)) {
+ do {
+ status = event->status;
+ list_init(&tmp_callbacks);
+ list_move(&event->callbacks, &tmp_callbacks);
+ /* Call all the callbacks without lock. */
+ CL_OBJECT_UNLOCK(event);
+
+ list_for_each_safe(pos, n, &tmp_callbacks)
+ {
+ cb = list_entry(pos, _cl_event_user_callback, node);
+
+ assert(cb->executed == CL_FALSE);
+
+ if (cb->status < status)
continue;
- node->next = cb;
- }
- /* Insert the user event to enqueue_callback's wait_user_events */
- TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
- cl_event_add_ref(event_wait_list[i]);
- if(queue)
- cl_command_queue_insert_event(queue, event_wait_list[i]);
- if(queue && data->type == EnqueueBarrier){
- cl_command_queue_insert_barrier_event(queue, event_wait_list[i]);
- }
- } else if(event_wait_list[i]->enqueue_cb != NULL) {
- user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
- while(user_events != NULL) {
- /* Insert the enqueue_callback to user event's waits_tail */
- node = user_events->event->waits_head;
- if(node == NULL)
- event_wait_list[i]->waits_head = cb;
- else{
- while((node != cb) && node->next)
- node = node->next;
- if(node == cb) { //wait on dup user event
- user_events = user_events->next;
- continue;
- }
- node->next = cb;
- }
- /* Insert the user event to enqueue_callback's wait_user_events */
- TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
- cl_event_add_ref(user_events->event);
- if(queue)
- cl_command_queue_insert_event(event->queue, user_events->event);
- if(queue && data->type == EnqueueBarrier){
- cl_command_queue_insert_barrier_event(event->queue, user_events->event);
- }
- user_events = user_events->next;
+ list_node_del(&cb->node);
+ cb->executed = CL_TRUE;
+ cb->pfn_notify(event, status, cb->user_data);
+ cl_free(cb);
}
- }
- }
- if(event != NULL && event->queue != NULL && event->gpgpu_event != NULL) {
- event->gpgpu = cl_thread_gpgpu_take(event->queue);
- data->ptr = (void *)event->gpgpu_event;
- }
- cb->data = *data;
- if(event)
- event->enqueue_cb = cb;
-
-exit:
- return;
-error:
- if(cb) {
- while(cb->wait_user_events) {
- u_ev = cb->wait_user_events;
- cb->wait_user_events = cb->wait_user_events->next;
- cl_event_delete(u_ev->event);
- cl_free(u_ev);
- }
- for(i=0; i<cb->num_events; i++) {
- if(cb->wait_list[i]) {
- cl_event_delete(cb->wait_list[i]);
- }
- }
- cl_free(cb);
- }
- goto exit;
-}
-void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) {
- user_callback *user_cb = NULL;
- user_callback *queue_cb = NULL; // For thread safety, we create a queue that holds user_callback's pfn_notify contents
- user_callback *temp_cb = NULL;
- user_cb = event->user_cb;
- pthread_mutex_lock(&event->ctx->event_lock);
- while(user_cb) {
- if(user_cb->status >= status
- && user_cb->executed == CL_FALSE) { // Added check to not execute a callback when it was already handled
- user_cb->executed = CL_TRUE;
- temp_cb = cl_malloc(sizeof(user_callback));
- if(!temp_cb) {
- break; // Out of memory
- }
- temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call ppfn_notify out of the pthread_mutex
- temp_cb->user_data = user_cb->user_data;
- if(free_cb) {
- cl_free(user_cb);
- }
- if(!queue_cb) {
- queue_cb = temp_cb;
- queue_cb->next = NULL;
- } else { // Enqueue First
- temp_cb->next = queue_cb;
- queue_cb = temp_cb;
- }
- }
- user_cb = user_cb->next;
- }
- pthread_mutex_unlock(&event->ctx->event_lock);
-
- // Calling the callbacks outside of the event_lock is required because the callback can call cl_api functions and get deadlocked
- while(queue_cb) { // For each callback queued, actually execute the callback
- queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
- temp_cb = queue_cb;
- queue_cb = queue_cb->next;
- cl_free(temp_cb);
- }
-}
+ CL_OBJECT_LOCK(event);
-void cl_event_set_status(cl_event event, cl_int status)
-{
- cl_int ret, i;
- cl_event evt;
+ // Set back the uncalled callbacks.
+ list_merge(&event->callbacks, &tmp_callbacks);
- pthread_mutex_lock(&event->ctx->event_lock);
- if(status >= event->status) {
- pthread_mutex_unlock(&event->ctx->event_lock);
- return;
- }
- if(event->status <= CL_COMPLETE) {
- event->status = status; //have done enqueue before or doing in another thread
- pthread_mutex_unlock(&event->ctx->event_lock);
- return;
+ /* Status may have changed while unlocked; need to check again. */
+ } while (status != event->status);
}
- if(status <= CL_COMPLETE) {
- if(event->enqueue_cb) {
- if(status == CL_COMPLETE) {
- cl_enqueue_handle(event, &event->enqueue_cb->data);
- if(event->gpgpu_event)
- cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine
- } else {
- if(event->gpgpu_event) {
- // Error then cancel the enqueued event.
- cl_gpgpu_delete(event->gpgpu);
- event->gpgpu = NULL;
- }
- }
+ /* Wake up all the waiters for the status change. */
+ CL_OBJECT_NOTIFY_COND(event);
+
+ if (event->status <= CL_COMPLETE) {
+ notify_queue = CL_TRUE;
+ }
- event->status = status; //Change the event status after enqueue and befor unlock
+ CL_OBJECT_UNLOCK(event);
- pthread_mutex_unlock(&event->ctx->event_lock);
- for(i=0; i<event->enqueue_cb->num_events; i++)
- cl_event_delete(event->enqueue_cb->wait_list[i]);
- pthread_mutex_lock(&event->ctx->event_lock);
+ /* Need to notify all the command queues within the same context. */
+ if (notify_queue) {
+ cl_command_queue queue = NULL;
- if(event->enqueue_cb->wait_list)
- cl_free(event->enqueue_cb->wait_list);
- cl_free(event->enqueue_cb);
- event->enqueue_cb = NULL;
+ /* First, we need to remove it from the queue's barrier list. */
+ if (CL_EVENT_IS_BARRIER(event)) {
+ assert(event->queue);
+ cl_command_queue_remove_barrier_event(event->queue, event);
}
- }
- if(event->status >= status) //maybe changed in other threads
- event->status = status;
- pthread_mutex_unlock(&event->ctx->event_lock);
-
- /* Call user callback */
- cl_event_call_callback(event, status, CL_FALSE);
-
- if(event->type == CL_COMMAND_USER) {
- /* Check all defer enqueue */
- enqueue_callback *cb, *enqueue_cb = event->waits_head;
- while(enqueue_cb) {
- /* Remove this user event in enqueue_cb, update the header if needed. */
- cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
- cl_event_delete(event);
-
- /* Still wait on other user events */
- if(enqueue_cb->wait_user_events != NULL) {
- enqueue_cb = enqueue_cb->next;
- continue;
- }
- //remove user event frome enqueue_cb's ctx
- cl_command_queue_remove_event(enqueue_cb->event->queue, event);
- cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
-
- /* All user events complete, now wait enqueue events */
- ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
- enqueue_cb->event->queue);
- assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
- ret = ~ret;
- cb = enqueue_cb;
- enqueue_cb = enqueue_cb->next;
-
- /* Call the pending operation */
- evt = cb->event;
- /* TODO: if this event wait on several events, one event's
- status is error, the others is complete, what's the status
- of this event? Can't find the description in OpenCL spec.
- Simply update to latest finish wait event.*/
- cl_event_set_status(cb->event, status);
- if(evt->emplict == CL_FALSE) {
- cl_event_delete(evt);
- }
+ /* Then, notify all the queues within the same context. */
+ CL_OBJECT_LOCK(event->ctx);
+ /* Temporarily disable adding and removing queues on the context. We need to
+ make sure all the queues in the context stay valid. */
+ event->ctx->queue_modify_disable++;
+ CL_OBJECT_UNLOCK(event->ctx);
+ list_for_each(pos, &event->ctx->queues)
+ {
+ queue = (cl_command_queue)(list_entry(pos, _cl_base_object, node));
+ assert(queue != NULL);
+ cl_command_queue_notify(queue);
}
- event->waits_head = NULL;
+ CL_OBJECT_LOCK(event->ctx);
+ /* Re-enable adding and removing queues on the context. */
+ event->ctx->queue_modify_disable--;
+ CL_OBJECT_NOTIFY_COND(event->ctx);
+ CL_OBJECT_UNLOCK(event->ctx);
}
- if(event->status <= CL_COMPLETE){
- /* Maintain the last_list when event completed*/
- if (event->last_prev)
- event->last_prev->last_next = event->last_next;
- if (event->last_next)
- event->last_next->last_prev = event->last_prev;
- if(event->queue && get_last_event(event->queue) == event)
- set_last_event(event->queue, event->last_next);
- event->last_prev = NULL;
- event->last_next = NULL;
- cl_event_delete(event);
- }
+ return CL_SUCCESS;
}
-void cl_event_update_status(cl_event event, int wait)
+LOCAL cl_int
+cl_event_wait_for_event_ready(const cl_event event)
{
- if(event->status <= CL_COMPLETE)
- return;
- if((event->gpgpu_event) &&
- (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete))
- cl_event_set_status(event, CL_COMPLETE);
+ assert(CL_OBJECT_IS_EVENT(event));
+ return cl_event_wait_for_events_list(event->depend_event_num, event->depend_events);
}
-cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event* event)
+LOCAL cl_int
+cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list)
{
- enqueue_data data = { 0 };
+ int i;
cl_event e;
+ cl_int ret = CL_SUCCESS;
- e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
- if(e == NULL)
- return CL_OUT_OF_HOST_MEMORY;
+ for (i = 0; i < num_events; i++) {
+ e = event_list[i];
+ assert(e);
+ assert(CL_OBJECT_IS_EVENT(e));
- if(event != NULL ){
- *event = e;
- }
+ CL_OBJECT_LOCK(e);
+ while (e->status > CL_COMPLETE) {
+ CL_OBJECT_WAIT_ON_COND(e);
+ }
-//enqueues a marker command which waits for either a list of events to complete, or if the list is
-//empty it waits for all commands previously enqueued in command_queue to complete before it completes.
- if(num_events_in_wait_list > 0){
- if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
- data.type = EnqueueMarker;
- cl_event_new_enqueue_callback(event?*event:NULL, &data, num_events_in_wait_list, event_wait_list);
- return CL_SUCCESS;
+ assert(e->status <= CL_COMPLETE);
+ /* If some error happened, return the error. */
+ if (e->status < CL_COMPLETE) {
+ ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
}
- } else if(queue->wait_events_num > 0) {
- data.type = EnqueueMarker;
- cl_event_new_enqueue_callback(event?*event:NULL, &data, queue->wait_events_num, queue->wait_events);
- return CL_SUCCESS;
+ CL_OBJECT_UNLOCK(e);
}
- cl_event_update_last_events(queue,1);
-
- cl_event_set_status(e, CL_COMPLETE);
- return CL_SUCCESS;
+ return ret;
}
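The wait above is the classic mutex/condition-variable pattern: waiters sleep while status > CL_COMPLETE, and cl_event_set_status wakes them through CL_OBJECT_NOTIFY_COND after every transition. A standalone pthreads sketch of the same pattern (illustrative only, not part of this patch):

    #include <pthread.h>

    struct toy_event {
      pthread_mutex_t lock;
      pthread_cond_t cond;
      int status;                       /* > 0 means still executing */
    };

    static void toy_wait(struct toy_event *e)
    {
      pthread_mutex_lock(&e->lock);
      while (e->status > 0)             /* status > CL_COMPLETE in the real code */
        pthread_cond_wait(&e->cond, &e->lock);
      pthread_mutex_unlock(&e->lock);
    }

    static void toy_set_status(struct toy_event *e, int status)
    {
      pthread_mutex_lock(&e->lock);
      e->status = status;
      pthread_cond_broadcast(&e->cond); /* CL_OBJECT_NOTIFY_COND */
      pthread_mutex_unlock(&e->lock);
    }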
-cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event* event)
+LOCAL cl_int
+cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+ cl_event *event, cl_context ctx)
{
- enqueue_data data = { 0 };
- cl_event e;
+ cl_int err = CL_SUCCESS;
+ cl_int i;
- e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE);
- if(e == NULL)
- return CL_OUT_OF_HOST_MEMORY;
+ do {
+ /* check the event_wait_list and num_events_in_wait_list */
+ if ((event_wait_list == NULL) && (num_events_in_wait_list > 0)) {
+ err = CL_INVALID_EVENT_WAIT_LIST;
+ break;
+ }
- if(event != NULL ){
- *event = e;
- }
-//enqueues a barrier command which waits for either a list of events to complete, or if the list is
-//empty it waits for all commands previously enqueued in command_queue to complete before it completes.
- if(num_events_in_wait_list > 0){
- if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
- data.type = EnqueueBarrier;
- cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
- return CL_SUCCESS;
+ if ((event_wait_list != NULL) && (num_events_in_wait_list == 0)) {
+ err = CL_INVALID_EVENT_WAIT_LIST;
+ break;
}
- } else if(queue->wait_events_num > 0) {
- data.type = EnqueueBarrier;
- cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
- return CL_SUCCESS;
- }
- cl_event_update_last_events(queue,1);
+ /* check the event and context */
+ for (i = 0; i < num_events_in_wait_list; i++) {
+ if (!CL_OBJECT_IS_EVENT(event_wait_list[i])) {
+ err = CL_INVALID_EVENT_WAIT_LIST;
+ break;
+ }
- cl_event_set_status(e, CL_COMPLETE);
- return CL_SUCCESS;
-}
+ if (event == event_wait_list + i) { /* Pointer of element of the wait list */
+ err = CL_INVALID_EVENT_WAIT_LIST;
+ break;
+ }
-cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time)
-{
- struct timespec ts;
+ /* check all belong to same context. */
+ if (ctx == NULL) {
+ ctx = event_wait_list[i]->ctx;
+ }
+ if (event_wait_list[i]->ctx != ctx) {
+ err = CL_INVALID_CONTEXT;
+ break;
+ }
+ }
- if(clock_gettime(CLOCK_MONOTONIC_RAW,&ts) != 0){
- printf("CPU Timmer error\n");
- return CL_FALSE;
- }
- *cpu_time = (1000000000.0) * (cl_ulong) ts.tv_sec + (cl_ulong) ts.tv_nsec;
+ if (err != CL_SUCCESS)
+ break;
- return CL_SUCCESS;
+ } while (0);
+
+ return err;
}
-cl_int cl_event_get_queued_cpu_timestamp(cl_event event)
+/* When we call this function, all the events it depends
+ on should already be ready, unless ignore_depends is set. */
+LOCAL cl_uint
+cl_event_exec(cl_event event, cl_int exec_to_status, cl_bool ignore_depends)
{
- cl_int ret_val;
+ /* We are MT-safe here; no one should call this
+ at the same time, so there is no need to lock. */
+ cl_int ret = CL_SUCCESS;
+ cl_int cur_status = cl_event_get_status(event);
+ cl_int depend_status;
+ cl_int s;
+
+ assert(exec_to_status >= CL_COMPLETE);
+ assert(exec_to_status <= CL_QUEUED);
+ if (cur_status < CL_COMPLETE) {
+ return cur_status;
+ }
+
+ depend_status = cl_event_is_ready(event);
+ assert(depend_status <= CL_COMPLETE || ignore_depends || exec_to_status == CL_QUEUED);
+ if (depend_status < CL_COMPLETE) { // Error happened, cancel exec.
+ ret = cl_event_set_status(event, depend_status);
+ return depend_status;
+ }
+
+ if (cur_status <= exec_to_status) {
+ return ret;
+ }
+
+ /* Exec to the target status. */
+ for (s = cur_status - 1; s >= exec_to_status; s--) {
+ assert(s >= CL_COMPLETE);
+ ret = cl_enqueue_handle(&event->exec_data, s);
+
+ if (ret != CL_SUCCESS) {
+ assert(ret < 0);
+ DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error status is %d",
+ event, event->event_type, ret);
+ ret = cl_event_set_status(event, ret);
+ assert(ret == CL_SUCCESS);
+ return ret; // Failed; do not go any further.
+ } else {
+ assert(!CL_EVENT_IS_USER(event));
+ if ((event->queue->props & CL_QUEUE_PROFILING_ENABLE) != 0) {
+ /* record the timestamp before actually doing something. */
+ cl_event_update_timestamp(event, s);
+ }
- ret_val = cl_event_get_cpu_timestamp(&event->queued_timestamp);
+ ret = cl_event_set_status(event, s);
+ assert(ret == CL_SUCCESS);
+ }
+ }
- return ret_val;
+ return ret;
}
-cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp)
+/* 0 means ready, >0 means not ready, <0 means error. */
+LOCAL cl_int
+cl_event_is_ready(cl_event event)
{
- cl_ulong ret_val;
-
- if(end_timestamp > start_timestamp){
- ret_val = end_timestamp - start_timestamp;
- }
- else {
- /*if start time stamp is greater than end timstamp then set ret value to max*/
- ret_val = ((cl_ulong) 1 << 32);
- }
+ int i;
+ int status;
+ int ret_status = CL_COMPLETE;
- return ret_val;
-}
+ for (i = 0; i < event->depend_event_num; i++) {
+ status = cl_event_get_status(event->depend_events[i]);
-cl_ulong cl_event_get_start_timestamp(cl_event event)
-{
- cl_ulong ret_val;
+ if (status > CL_COMPLETE) { // Found one not ready; return its status.
+ return status;
+ }
- ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[2]);
+ if (status < CL_COMPLETE) { // Record the error.
+ ret_status = status;
+ }
+ }
- return ret_val;
+ return ret_status;
}
-cl_ulong cl_event_get_end_timestamp(cl_event event)
+LOCAL cl_event
+cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_bool is_barrier, cl_int *error)
{
- cl_ulong ret_val;
+ cl_event e = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_command_type type = CL_COMMAND_MARKER;
+ enqueue_type eq_type = EnqueueMarker;
- ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[3]);
+ if (is_barrier) {
+ type = CL_COMMAND_BARRIER;
+ eq_type = EnqueueBarrier;
+ }
- return ret_val;
-}
+ if (event_wait_list) {
+ assert(num_events_in_wait_list > 0);
-cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
-{
- cl_ulong ret_val = 0;
- GET_QUEUE_THREAD_GPGPU(event->queue);
+ e = cl_event_create(queue->ctx, queue, num_events_in_wait_list,
+ event_wait_list, type, &err);
+ if (err != CL_SUCCESS) {
+ *error = err;
+ return NULL;
+ }
+ } else { /* The marker depends on all events in the queue now. */
+ cl_command_queue_enqueue_worker worker = &queue->worker;
+ cl_uint i;
+ cl_uint event_num;
+ cl_event *depend_events;
+
+ CL_OBJECT_LOCK(queue);
+
+ /* First, wait for the command queue to retire the event currently executing. */
+ while (1) {
+ if (worker->quit) { // queue already destroyed?
+ CL_OBJECT_UNLOCK(queue);
+ *error = CL_INVALID_COMMAND_QUEUE;
+ return NULL;
+ }
- if (!event->gpgpu_event) {
- cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
- event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
- return CL_SUCCESS;
- }
+ if (worker->in_exec_status != CL_COMPLETE) {
+ CL_OBJECT_WAIT_ON_COND(queue);
+ continue;
+ }
- if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
- param_name == CL_PROFILING_COMMAND_QUEUED) {
- cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
- event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
- return CL_SUCCESS;
- } else if(param_name == CL_PROFILING_COMMAND_START) {
- cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val);
- event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
- return CL_SUCCESS;
- } else if (param_name == CL_PROFILING_COMMAND_END) {
- cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val);
- event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
- return CL_SUCCESS;
- }
- return CL_INVALID_VALUE;
-}
+ break;
+ }
-cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event)
-{
- user_event * u_iter = *p_u_ev;
- user_event * u_ev;
-
- while(u_iter)
- {
- if(u_iter->event == event)
- return CL_SUCCESS;
- u_iter = u_iter->next;
- }
+ event_num = 0;
+ depend_events = NULL;
+ if (!list_empty(&worker->enqueued_events)) {
+ depend_events = cl_command_queue_record_in_queue_events(queue, &event_num);
+ }
- TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
- u_ev->event = event;
- u_ev->next = *p_u_ev;
- *p_u_ev = u_ev;
+ CL_OBJECT_UNLOCK(queue);
+ e = cl_event_create(queue->ctx, queue, event_num, depend_events, type, &err);
- return CL_SUCCESS;
-error:
- return CL_FALSE;
-}
+ for (i = 0; i < event_num; i++) { // unref the temporary references
+ cl_event_delete(depend_events[i]);
+ }
+ if (depend_events)
+ cl_free(depend_events);
-cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event)
-{
- user_event * u_iter = *p_u_ev;
- user_event * u_prev = *p_u_ev;
-
- while(u_iter){
- if(u_iter->event == event ){
- if(u_iter == *p_u_ev){
- *p_u_ev = u_iter->next;
- }else{
- u_prev->next = u_iter->next;
- }
- cl_free(u_iter);
- break;
+ if (err != CL_SUCCESS) {
+ *error = err;
+ return NULL;
}
- u_prev = u_iter;
- u_iter = u_iter->next;
}
- return CL_SUCCESS;
+ e->exec_data.type = eq_type;
+ *error = CL_SUCCESS;
+ return e;
}
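
The tri-state convention for cl_event_is_ready() (0 ready, >0 still pending, <0 a dependency error) lets the enqueue path distinguish "retry later" from "fail now". A minimal sketch of a consumer, assuming the signatures declared in src/cl_event.h below; the helper name try_submit() is hypothetical, not part of this patch:

#include "cl_event.h"

static cl_int try_submit(cl_event event)
{
  cl_int ready = cl_event_is_ready(event);
  if (ready < 0)   /* A dependency failed: propagate the error. */
    return ready;
  if (ready > 0)   /* Still waiting on a dependency: keep it queued. */
    return CL_QUEUED;
  /* All dependencies complete: run the event toward CL_COMPLETE. */
  return (cl_int)cl_event_exec(event, CL_COMPLETE, CL_FALSE);
}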
diff --git a/src/cl_event.h b/src/cl_event.h
index 67fab19..f28844a 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -14,111 +14,75 @@
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
*/
-#ifndef __CL_EVENT_H__
-#define __CL_EVENT_H__
+#ifndef __CL_EVENT_H_
+#define __CL_EVENT_H_
#include <semaphore.h>
-#include "cl_internals.h"
-#include "cl_driver.h"
+#include "cl_base_object.h"
#include "cl_enqueue.h"
#include "CL/cl.h"
-#define CL_ENQUEUE_EXECUTE_IMM 0
-#define CL_ENQUEUE_EXECUTE_DEFER 1
+typedef void(CL_CALLBACK *cl_event_notify_cb)(cl_event event, cl_int event_command_exec_status, void *user_data);
-typedef struct _user_event {
- cl_event event; /* The user event */
- struct _user_event* next; /* Next user event in list */
-} user_event;
+typedef struct _cl_event_user_callback {
+ cl_int status; /* The execution status */
+ cl_bool executed; /* Indicates whether the callback has been called */
+ cl_event_notify_cb pfn_notify; /* Callback function */
+ void *user_data; /* Callback user data */
+ list_node node; /* Event callback list node */
+} _cl_event_user_callback;
-typedef struct _enqueue_callback {
- cl_event event; /* The event relative this enqueue callback */
- enqueue_data data; /* Hold all enqueue callback's infomation */
- cl_uint num_events; /* num events in wait list */
- cl_event* wait_list; /* All event wait list this callback wait on */
- user_event* wait_user_events; /* The head of user event list the callback wait on */
- struct _enqueue_callback* next; /* The next enqueue callback in wait list */
-} enqueue_callback;
+typedef _cl_event_user_callback *cl_event_user_callback;
-typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
+typedef struct _cl_event {
+ _cl_base_object base;
+ cl_context ctx; /* The context associated with event */
+ cl_command_queue queue; /* The command queue associated with event */
+ cl_command_type event_type; /* Event type. */
+ cl_bool is_barrier; /* Is this event a barrier */
+ cl_int status; /* The execution status */
+ cl_event *depend_events; /* The events that must complete before this one. */
+ cl_uint depend_event_num; /* Number of dependent events. */
+ list_head callbacks; /* The event callback functions */
+ list_node enqueue_node; /* The node in the enqueue list. */
+ cl_ulong timestamp[5]; /* The time stamps for profiling. */
+ enqueue_data exec_data; /* Context for executing this event. */
+} _cl_event;
-typedef struct _user_callback {
- cl_int status; /* The execution status */
- cl_bool executed; /* Indicat the callback function been called or not */
- EVENT_NOTIFY pfn_notify; /* Callback function */
- void* user_data; /* Callback user data */
- struct _user_callback* next; /* Next event callback in list */
-} user_callback;
+#define CL_OBJECT_EVENT_MAGIC 0x8324a9f810ebf90fLL
+#define CL_OBJECT_IS_EVENT(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_EVENT_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
-struct _cl_event {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a sampler object */
- volatile int ref_n; /* We reference count this object */
- cl_context ctx; /* The context associated with event */
- cl_event prev, next; /* We chain the memory buffers together */
- cl_command_queue queue; /* The command queue associated with event */
- cl_command_type type; /* The command type associated with event */
- cl_int status; /* The execution status */
- cl_gpgpu gpgpu; /* Current gpgpu, owned by this structure. */
- cl_gpgpu_event gpgpu_event; /* The event object communicate with hardware */
- user_callback* user_cb; /* The event callback functions */
- enqueue_callback* enqueue_cb; /* This event's enqueue */
- enqueue_callback* waits_head; /* The head of enqueues list wait on this event */
- cl_bool emplict; /* Identify this event whether created by api emplict*/
- cl_ulong timestamp[4];/* The time stamps for profiling. */
- cl_ulong queued_timestamp;
- cl_event last_next, last_prev;/* We need a list to monitor untouchable api event*/
-};
+#define CL_EVENT_STATE_UNKNOWN 0x4
+
+#define CL_EVENT_IS_MARKER(E) (E->event_type == CL_COMMAND_MARKER)
+#define CL_EVENT_IS_BARRIER(E) (E->event_type == CL_COMMAND_BARRIER)
+#define CL_EVENT_IS_USER(E) (E->event_type == CL_COMMAND_USER)
+
+#define CL_EVENT_INVALID_TIMESTAMP 0xFFFFFFFFFFFFFFFF
/* Create a new event object */
-cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
-/* Unref the object and delete it if no more reference on it */
-void cl_event_delete(cl_event);
-/* Add one more reference to this object */
-void cl_event_add_ref(cl_event);
-/* Register a user callback function for specific commond execution status */
-cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
-/* Execute the event's callback if the event's status supersedes the callback's status. Free the callback if specified */
-void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
-/* Check events wait list for enqueue commonds */
-cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
-/* Wait the all events in wait list complete */
-cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
-/* New a enqueue suspend task */
-void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
-/* Set the event status and call all callbacks */
-void cl_event_set_status(cl_event, cl_int);
-/* Check and update event status */
-void cl_event_update_status(cl_event, cl_int);
-/* Create the marker event */
-cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*);
-/* Create the barrier event */
-cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*);
-/* Get the cpu time */
-cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time);
-/*Get the cpu time for queued*/
-cl_int cl_event_get_queued_cpu_timestamp(cl_event event);
-/*get timestamp delate between end and start*/
-cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp);
-/*Get start time stamp*/
-cl_ulong cl_event_get_start_timestamp(cl_event event);
-/*Get end time stamp*/
-cl_ulong cl_event_get_end_timestamp(cl_event event);
-/* Do the event profiling */
-cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
-/* insert the user event */
-cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
-/* remove the user event */
-cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
-/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
-cl_int cl_event_flush(cl_event event);
-/* monitor or block wait all events in the last_event list */
-void cl_event_update_last_events(cl_command_queue queuet, int wait);
-/* insert the event into the last_event list in queue */
-void cl_event_insert_last_events(cl_command_queue queue, cl_event event);
+extern cl_event cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
+ const cl_event *event_list, cl_command_type type, cl_int *errcode_ret);
+extern cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+ cl_event* event, cl_context ctx);
+extern cl_uint cl_event_exec(cl_event event, cl_int exec_to_status, cl_bool ignore_depends);
+/* 0 means ready, >0 means not ready, <0 means error. */
+extern cl_int cl_event_is_ready(cl_event event);
+extern cl_int cl_event_get_status(cl_event event);
+extern void cl_event_add_ref(cl_event event);
+extern void cl_event_delete(cl_event event);
+extern cl_int cl_event_set_status(cl_event event, cl_int status);
+extern cl_int cl_event_set_callback(cl_event event, cl_int exec_type,
+ cl_event_notify_cb pfn_notify, void *user_data);
+extern cl_int cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list);
+extern cl_int cl_event_wait_for_event_ready(cl_event event);
+extern cl_event cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_bool is_barrier,
+ cl_int* error);
+extern void cl_event_update_timestamp(cl_event event, cl_int status);
#endif /* __CL_EVENT_H_ */
-
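
The reworked header replaces the old EVENT_NOTIFY typedef with cl_event_notify_cb, which has the shape expected by the standard clSetEventCallback() entry point. A hedged host-side sketch using only standard OpenCL calls (the callback name and message are illustrative):

#include <CL/cl.h>
#include <stdio.h>

static void CL_CALLBACK on_complete(cl_event e, cl_int status, void *user)
{
  /* status is the execution status that triggered the callback. */
  printf("event %p reached status %d (%s)\n",
         (void *)e, status, (const char *)user);
}

static cl_int watch_event(cl_event ev)
{
  /* Fires once the event reaches (or passes) CL_COMPLETE. */
  return clSetEventCallback(ev, CL_COMPLETE, on_complete, (void *)"copy done");
}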
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 40b6ddc..a3c71ca 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -1,5 +1,5 @@
#include "llvm/Config/llvm-config.h"
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL
#include "EGL/egl.h"
#include "EGL/eglext.h"
#endif
@@ -55,7 +55,7 @@ void check_opt1_extension(cl_extensions_t *extensions)
void
check_gl_extension(cl_extensions_t *extensions) {
-#if defined(HAS_EGL)
+#if defined(HAS_GL_EGL)
int id;
/* For now, we only support cl_khr_gl_sharing. */
for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
@@ -152,6 +152,7 @@ cl_intel_platform_extension_init(cl_platform_id intel_platform)
static int ext_initialized = 0;
/* The EXT should be only inited once. */
+ (void) ext_initialized;
assert(!ext_initialized);
check_basic_extension(&intel_platform_extensions);
check_opt1_extension(&intel_platform_extensions);
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 1139775..52a4953 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -9,6 +9,9 @@
DECL_EXT(khr_local_int32_base_atomics) \
DECL_EXT(khr_local_int32_extended_atomics) \
DECL_EXT(khr_byte_addressable_store) \
+ DECL_EXT(khr_3d_image_writes)\
+ DECL_EXT(khr_image2d_from_buffer)\
+ DECL_EXT(khr_depth_images)\
DECL_EXT(khr_fp64)
/* The OPT1 extensions are those optional extensions
@@ -16,19 +19,17 @@
#define DECL_OPT1_EXTENSIONS \
DECL_EXT(khr_int64_base_atomics)\
DECL_EXT(khr_int64_extended_atomics)\
- DECL_EXT(khr_3d_image_writes)\
DECL_EXT(khr_fp16)\
- DECL_EXT(khr_image2d_from_buffer)\
DECL_EXT(khr_initialize_memory)\
DECL_EXT(khr_context_abort)\
- DECL_EXT(khr_depth_images)\
DECL_EXT(khr_spir) \
DECL_EXT(khr_icd)
#define DECL_INTEL_EXTENSIONS \
DECL_EXT(intel_accelerator) \
DECL_EXT(intel_motion_estimation) \
- DECL_EXT(intel_subgroups)
+ DECL_EXT(intel_subgroups) \
+ DECL_EXT(intel_subgroups_short)
#define DECL_GL_EXTENSIONS \
DECL_EXT(khr_gl_sharing)\
@@ -63,7 +64,7 @@ cl_khr_extension_id_max
#define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
#define OPT1_EXT_END_ID EXT_ID(khr_icd)
#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
-#define INTEL_EXT_END_ID EXT_ID(intel_subgroups)
+#define INTEL_EXT_END_ID EXT_ID(intel_subgroups_short)
#define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
#define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
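
The extension tables above are maintained with an X-macro: each DECL_*_EXTENSIONS block expands DECL_EXT once per extension, and the EXT_ID / *_START_ID / *_END_ID macros index into the resulting enum, which is why moving khr_3d_image_writes between blocks also shifts the ID ranges. An illustrative sketch of the pattern (names here are made up; the real definitions in src/cl_extensions.h differ in detail):

/* One list, expanded twice with different DECL_EXT definitions. */
#define MY_EXTENSIONS \
  DECL_EXT(khr_icd) \
  DECL_EXT(intel_subgroups) \
  DECL_EXT(intel_subgroups_short)

/* Pass 1: build an enum of IDs. */
#define DECL_EXT(name) my_##name##_ext_id,
enum my_extension_id { MY_EXTENSIONS my_extension_id_max };
#undef DECL_EXT

/* Pass 2: build the matching name table. */
#define DECL_EXT(name) "cl_" #name,
static const char *my_ext_names[] = { MY_EXTENSIONS };
#undef DECL_EXT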
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index aa13a3d..f190b0d 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -38,10 +38,13 @@ gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count = NULL;
+gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table = NULL;
gbe_program_delete_cb *interp_program_delete = NULL;
gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
gbe_program_get_kernel_cb *interp_program_get_kernel = NULL;
+gbe_program_get_device_enqueue_kernel_name_cb *interp_program_get_device_enqueue_kernel_name = NULL;
gbe_kernel_get_name_cb *interp_kernel_get_name = NULL;
gbe_kernel_get_attributes_cb *interp_kernel_get_attributes = NULL;
gbe_kernel_get_code_cb *interp_kernel_get_code = NULL;
@@ -64,6 +67,7 @@ gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_kernel_get_ocl_version_cb *interp_kernel_get_ocl_version = NULL;
gbe_output_profiling_cb* interp_output_profiling = NULL;
gbe_get_profiling_bti_cb* interp_get_profiling_bti = NULL;
gbe_dup_profiling_cb* interp_dup_profiling = NULL;
@@ -73,6 +77,7 @@ gbe_dup_printfset_cb* interp_dup_printfset = NULL;
gbe_release_printf_info_cb* interp_release_printf_info = NULL;
gbe_output_printf_cb* interp_output_printf = NULL;
gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info = NULL;
+gbe_kernel_use_device_enqueue_cb *interp_kernel_use_device_enqueue = NULL;
struct GbeLoaderInitializer
{
@@ -110,6 +115,14 @@ struct GbeLoaderInitializer
if (interp_program_get_global_constant_data == NULL)
return false;
+ interp_program_get_global_reloc_count = *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_count");
+ if (interp_program_get_global_reloc_count == NULL)
+ return false;
+
+ interp_program_get_global_reloc_table = *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_table");
+ if (interp_program_get_global_reloc_table == NULL)
+ return false;
+
interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
if (interp_program_delete == NULL)
return false;
@@ -126,6 +139,10 @@ struct GbeLoaderInitializer
if (interp_program_get_kernel == NULL)
return false;
+ interp_program_get_device_enqueue_kernel_name = *(gbe_program_get_device_enqueue_kernel_name_cb**)dlsym(dlhInterp, "gbe_program_get_device_enqueue_kernel_name");
+ if (interp_program_get_device_enqueue_kernel_name == NULL)
+ return false;
+
interp_kernel_get_name = *(gbe_kernel_get_name_cb**)dlsym(dlhInterp, "gbe_kernel_get_name");
if (interp_kernel_get_name == NULL)
return false;
@@ -214,6 +231,10 @@ struct GbeLoaderInitializer
if (interp_kernel_get_image_data == NULL)
return false;
+ interp_kernel_get_ocl_version = *(gbe_kernel_get_ocl_version_cb**)dlsym(dlhInterp, "gbe_kernel_get_ocl_version");
+ if (interp_kernel_get_ocl_version == NULL)
+ return false;
+
interp_output_profiling = *(gbe_output_profiling_cb**)dlsym(dlhInterp, "gbe_output_profiling");
if (interp_output_profiling == NULL)
return false;
@@ -250,6 +271,10 @@ struct GbeLoaderInitializer
if (interp_kernel_get_arg_info == NULL)
return false;
+ interp_kernel_use_device_enqueue = *(gbe_kernel_use_device_enqueue_cb**)dlsym(dlhInterp, "gbe_kernel_use_device_enqueue");
+ if (interp_kernel_use_device_enqueue == NULL)
+ return false;
+
return true;
}
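
Each entry above is resolved as *(cb**)dlsym(...): the gbe library appears to export function-pointer variables rather than plain functions, so dlsym() returns the address of the exported pointer and the loader dereferences it once. A generic sketch of that double-indirection pattern under that assumption (plugin_fn and load_entry are illustrative names):

#include <dlfcn.h>
#include <stddef.h>

typedef int (plugin_fn)(int); /* the function type behind the pointer */

static plugin_fn *load_entry(void *handle, const char *symbol)
{
  /* dlsym() yields the address of the exported pointer variable;
   * dereference once to obtain the function pointer itself. */
  plugin_fn **slot = (plugin_fn **)dlsym(handle, symbol);
  return slot ? *slot : NULL;
}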
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index df808a5..df885d2 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -38,10 +38,13 @@ extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count;
+extern gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table;
extern gbe_program_delete_cb *interp_program_delete;
extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
extern gbe_program_get_kernel_cb *interp_program_get_kernel;
+extern gbe_program_get_device_enqueue_kernel_name_cb *interp_program_get_device_enqueue_kernel_name;
extern gbe_kernel_get_name_cb *interp_kernel_get_name;
extern gbe_kernel_get_attributes_cb *interp_kernel_get_attributes;
extern gbe_kernel_get_code_cb *interp_kernel_get_code;
@@ -64,6 +67,7 @@ extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_kernel_get_ocl_version_cb *interp_kernel_get_ocl_version;
extern gbe_output_profiling_cb* interp_output_profiling;
extern gbe_get_profiling_bti_cb* interp_get_profiling_bti;
extern gbe_dup_profiling_cb* interp_dup_profiling;
@@ -73,6 +77,7 @@ extern gbe_dup_printfset_cb* interp_dup_printfset;
extern gbe_release_printf_info_cb* interp_release_printf_info;
extern gbe_output_printf_cb* interp_output_printf;
extern gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info;
+extern gbe_kernel_use_device_enqueue_cb *interp_kernel_use_device_enqueue;
int CompilerSupported();
#ifdef __cplusplus
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
index 519aab6..897edb4 100644
--- a/src/cl_gl_api.c
+++ b/src/cl_gl_api.c
@@ -20,7 +20,7 @@
#include <stdio.h>
#include <string.h>
#include <assert.h>
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL
#include <GL/gl.h>
#endif
@@ -95,16 +95,7 @@ clCreateFromGLTexture3D(cl_context context,
GLuint texture,
cl_int * errcode_ret)
{
- cl_mem mem = NULL;
- cl_int err = CL_SUCCESS;
- CHECK_CONTEXT (context);
- CHECK_GL_CONTEXT (context);
-
- mem = cl_mem_new_gl_texture(context, flags, texture_target, miplevel, texture, &err);
-error:
- if (errcode_ret)
- *errcode_ret = err;
- return mem;
+ NOT_IMPLEMENTED;
}
cl_mem
@@ -120,6 +111,12 @@ clCreateFromGLTexture(cl_context context,
CHECK_CONTEXT (context);
CHECK_GL_CONTEXT (context);
+ // We only support GL_TEXTURE_2D for now.
+ if(target != GL_TEXTURE_2D){
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
mem = cl_mem_new_gl_texture(context, flags, target, miplevel, texture, &err);
error:
if (errcode_ret)
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 70a0a54..cf5ad7a 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -39,10 +39,19 @@
.native_vector_width_float = 4,
.native_vector_width_double = 2,
.native_vector_width_half = 8,
+#ifdef ENABLE_OPENCL_20
+.address_bits = 64,
+#else
.address_bits = 32,
+#endif
+.svm_capabilities = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER,
+.preferred_platform_atomic_alignment = 0,
+.preferred_global_atomic_alignment = 0,
+.preferred_local_atomic_alignment = 0,
.image_support = CL_TRUE,
.max_read_image_args = BTI_MAX_READ_IMAGE_ARGS,
.max_write_image_args = BTI_MAX_WRITE_IMAGE_ARGS,
+.max_read_write_image_args = BTI_MAX_WRITE_IMAGE_ARGS,
.image_max_array_size = 2048,
.image2d_max_width = 8192,
.image2d_max_height = 8192,
@@ -53,10 +62,15 @@
.max_samplers = 16,
.mem_base_addr_align = sizeof(cl_long) * 16 * 8,
.min_data_type_align_size = sizeof(cl_long) * 16,
+.max_pipe_args = 16,
+.pipe_max_active_reservations = 1,
+.pipe_max_packet_siz = 1024,
.double_fp_config = 0,
.global_mem_cache_type = CL_READ_WRITE_CACHE,
.max_constant_buffer_size = 128 * 1024 * 1024,
.max_constant_args = 8,
+.max_global_variable_size = 64 * 1024,
+.global_variable_preferred_total_size = 64 * 1024,
.error_correction_support = CL_FALSE,
#ifdef HAS_USERPTR
.host_unified_memory = CL_TRUE,
@@ -70,6 +84,12 @@
.linker_available = CL_TRUE,
.execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
.queue_properties = CL_QUEUE_PROFILING_ENABLE,
+.queue_on_host_properties = CL_QUEUE_PROFILING_ENABLE,
+.queue_on_device_properties = CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+.queue_on_device_preferred_size = 16 * 1024,
+.queue_on_device_max_size = 256 * 1024,
+.max_on_device_queues = 1,
+.max_on_device_events = 1024,
.platform = NULL, /* == intel_platform (set when requested) */
/* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
.single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
@@ -127,7 +147,6 @@ DECL_INFO_STRING(spir_versions, "1.2")
.partition_property = {0},
.affinity_domain = 0,
.partition_type = {0},
-.device_reference_count = 1,
.image_pitch_alignment = 1,
.image_base_address_alignment = 4096,
.cmrt_device = NULL
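
The new fields above surface the OpenCL 2.0 device properties (coarse-grain SVM, on-device queues, pipes) through the usual query path. A sketch reading a couple of them with the standard clGetDeviceInfo(), assuming OpenCL 2.0 headers and a valid cl_device_id for this driver:

#include <CL/cl.h>
#include <stdio.h>

static void print_cl20_caps(cl_device_id dev)
{
  cl_device_svm_capabilities svm = 0;
  cl_uint max_pipe_args = 0;
  clGetDeviceInfo(dev, CL_DEVICE_SVM_CAPABILITIES, sizeof(svm), &svm, NULL);
  clGetDeviceInfo(dev, CL_DEVICE_MAX_PIPE_ARGS,
                  sizeof(max_pipe_args), &max_pipe_args, NULL);
  printf("coarse-grain SVM: %s, max pipe args: %u\n",
         (svm & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? "yes" : "no",
         max_pipe_args);
}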
diff --git a/src/cl_image.c b/src/cl_image.c
index d58bdf3..d059304 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -91,6 +91,13 @@ cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
*bpp *= 4;
break;
+ case CL_sRGBA:
+ case CL_sBGRA:
+ if (type != CL_UNORM_INT8)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ *bpp *= 4;
+ break;
+
default: return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
};
@@ -189,13 +196,23 @@ cl_image_get_intel_format(const cl_image_format *fmt)
case CL_UNORM_INT8: return I965_SURFACEFORMAT_B8G8R8A8_UNORM;
default: return INTEL_UNSUPPORTED_FORMAT;
};
+ case CL_sRGBA:
+ switch (type) {
+ case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+ case CL_sBGRA:
+ switch (type) {
+ case CL_UNORM_INT8: return I965_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
default: return INTEL_UNSUPPORTED_FORMAT;
};
}
static const uint32_t cl_image_order[] = {
CL_R, CL_A, CL_RG, CL_RA, CL_RGB, CL_RGBA, CL_BGRA, CL_ARGB,
- CL_INTENSITY, CL_LUMINANCE, CL_Rx, CL_RGx, CL_RGBx
+ CL_INTENSITY, CL_LUMINANCE, CL_Rx, CL_RGx, CL_RGBx, CL_sRGBA, CL_sBGRA
};
static const uint32_t cl_image_type[] = {
@@ -211,6 +228,7 @@ static const size_t cl_image_type_n = SIZEOF32(cl_image_type);
cl_int
cl_image_get_supported_fmt(cl_context ctx,
+ cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_image_format *image_formats,
@@ -224,6 +242,10 @@ cl_image_get_supported_fmt(cl_context ctx,
.image_channel_data_type = cl_image_type[j]
};
const uint32_t intel_fmt = cl_image_get_intel_format(&fmt);
+ if (cl_image_order[i] >= CL_sRGBA &&
+ ((flags & CL_MEM_WRITE_ONLY) || (flags & CL_MEM_READ_WRITE) ||
+ (flags & CL_MEM_KERNEL_READ_AND_WRITE)))
+ continue;
if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
continue;
if (n < num_entries && image_formats) image_formats[n] = fmt;
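
cl_image_get_supported_fmt() now takes the mem flags so the sRGB orders added above can be filtered out of writable format lists (they are read-only here). From the application side this is the standard clGetSupportedImageFormats() query; a hedged sketch of the expected effect:

#include <CL/cl.h>

/* Counts supported 2D image formats for the given flags; with this
 * patch, the read-only count should include CL_sRGBA/CL_sBGRA while
 * the writable counts should not. */
static cl_uint count_formats(cl_context ctx, cl_mem_flags flags)
{
  cl_uint n = 0;
  clGetSupportedImageFormats(ctx, flags, CL_MEM_OBJECT_IMAGE2D,
                             0, NULL, &n);
  return n;
}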
diff --git a/src/cl_image.h b/src/cl_image.h
index ae74509..4f0d0f1 100644
--- a/src/cl_image.h
+++ b/src/cl_image.h
@@ -35,6 +35,7 @@ extern uint32_t cl_image_get_intel_format(const cl_image_format *fmt);
/* Return the list of formats supported by the API */
extern cl_int cl_image_get_supported_fmt(cl_context context,
+ cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_image_format *image_formats,
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index b380abe..f687084 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -45,14 +45,16 @@ cl_kernel_delete(cl_kernel k)
#ifdef HAS_CMRT
if (k->cmrt_kernel != NULL) {
cmrt_destroy_kernel(k);
- k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ CL_OBJECT_DESTROY_BASE(k);
cl_free(k);
return;
}
#endif
/* We are not done with the kernel */
- if (atomic_dec(&k->ref_n) > 1) return;
+ if (CL_OBJECT_DEC_REF(k) > 1)
+ return;
+
/* Release one reference on all bos we own */
if (k->bo) cl_buffer_unreference(k->bo);
/* This will be true for kernels created by clCreateKernel */
@@ -68,7 +70,17 @@ cl_kernel_delete(cl_kernel k)
}
if (k->image_sz)
cl_free(k->images);
- k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+
+ if (k->exec_info)
+ cl_free(k->exec_info);
+
+ if (k->device_enqueue_ptr)
+ cl_mem_svm_delete(k->program->ctx, k->device_enqueue_ptr);
+ if (k->device_enqueue_infos)
+ cl_free(k->device_enqueue_infos);
+
+ CL_OBJECT_DESTROY_BASE(k);
+
cl_free(k);
}
@@ -77,9 +89,7 @@ cl_kernel_new(cl_program p)
{
cl_kernel k = NULL;
TRY_ALLOC_NO_ERR (k, CALLOC(struct _cl_kernel));
- SET_ICD(k->dispatch)
- k->ref_n = 1;
- k->magic = CL_MAGIC_KERNEL_HEADER;
+ CL_OBJECT_INIT_BASE(k, CL_OBJECT_KERNEL_MAGIC);
k->program = p;
k->cmrt_kernel = NULL;
@@ -108,7 +118,7 @@ cl_kernel_get_attributes(cl_kernel k)
LOCAL void
cl_kernel_add_ref(cl_kernel k)
{
- atomic_inc(&k->ref_n);
+ CL_OBJECT_INC_REF(k);
}
LOCAL cl_int
@@ -156,16 +166,23 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
return CL_INVALID_ARG_VALUE;
cl_sampler s = *(cl_sampler*)value;
- if(s->magic != CL_MAGIC_SAMPLER_HEADER)
+ if(!CL_OBJECT_IS_SAMPLER(s))
return CL_INVALID_SAMPLER;
} else {
// should be image, GLOBAL_PTR, CONSTANT_PTR
- if (UNLIKELY(value == NULL && arg_type == GBE_ARG_IMAGE))
+ if (UNLIKELY(value == NULL && (arg_type == GBE_ARG_IMAGE ||
+ arg_type == GBE_ARG_PIPE)))
return CL_INVALID_ARG_VALUE;
if(value != NULL)
mem = *(cl_mem*)value;
+ if(arg_type == GBE_ARG_PIPE) {
+ _cl_mem_pipe* pipe = cl_mem_pipe(mem);
+ size_t type_size = (size_t)interp_kernel_get_arg_info(k->opaque, index, 5);
+ if(pipe->packet_size != type_size)
+ return CL_INVALID_ARG_VALUE;
+ }
if(value != NULL && mem) {
- if( CL_SUCCESS != is_valid_mem(mem, ctx->buffers))
+ if(CL_SUCCESS != cl_mem_is_valid(mem, ctx))
return CL_INVALID_MEM_OBJECT;
if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
@@ -252,11 +269,62 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
cl_mem_delete(k->args[index].mem);
k->args[index].mem = mem;
k->args[index].is_set = 1;
+ k->args[index].is_svm = mem->is_svm;
+ if(mem->is_svm)
+ k->args[index].ptr = mem->host_ptr;
k->args[index].local_sz = 0;
k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
return CL_SUCCESS;
}
+
+LOCAL cl_int
+cl_kernel_set_arg_svm_pointer(cl_kernel k, cl_uint index, const void *value)
+{
+ enum gbe_arg_type arg_type; /* kind of argument */
+ //size_t arg_sz; /* size of the argument */
+ cl_context ctx = k->program->ctx;
+ cl_mem mem= cl_context_get_svm_from_ptr(ctx, value);
+
+ if (UNLIKELY(index >= k->arg_n))
+ return CL_INVALID_ARG_INDEX;
+ arg_type = interp_kernel_get_arg_type(k->opaque, index);
+ //arg_sz = interp_kernel_get_arg_size(k->opaque, index);
+
+ if(arg_type != GBE_ARG_GLOBAL_PTR && arg_type != GBE_ARG_CONSTANT_PTR )
+ return CL_INVALID_ARG_VALUE;
+
+ if(mem == NULL)
+ return CL_INVALID_ARG_VALUE;
+
+ cl_mem_add_ref(mem);
+ if (k->args[index].mem)
+ cl_mem_delete(k->args[index].mem);
+
+ k->args[index].ptr = (void *)value;
+ k->args[index].mem = mem;
+ k->args[index].is_set = 1;
+ k->args[index].is_svm = 1;
+ k->args[index].local_sz = 0;
+ k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
+ return 0;
+}
+
+LOCAL cl_int
+cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value)
+{
+ cl_int err = CL_SUCCESS;
+ assert(k != NULL);
+
+ if (n == 0) return err;
+ TRY_ALLOC(k->exec_info, cl_calloc(n, 1));
+ memcpy(k->exec_info, value, n);
+ k->exec_info_n = n / sizeof(void *);
+
+error:
+ return err;
+}
+
LOCAL int
cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret)
@@ -275,13 +343,13 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
if (!param_value) return CL_SUCCESS;
if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
return CL_INVALID_VALUE;
- if ((cl_ulong)ret_info == 0) {
+ if ((size_t)ret_info == 0) {
*(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
- } else if ((cl_ulong)ret_info == 1 || (cl_ulong)ret_info == 4) {
+ } else if ((size_t)ret_info == 1 || (size_t)ret_info == 4) {
*(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
- } else if ((cl_ulong)ret_info == 2) {
+ } else if ((size_t)ret_info == 2) {
*(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
- } else if ((cl_ulong)ret_info == 3) {
+ } else if ((size_t)ret_info == 3) {
*(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
} else {
/* If no address qualifier is specified, the default address qualifier
@@ -334,6 +402,8 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
if (strstr((char*)ret_info, "restrict"))
type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
+ if (strstr((char*)ret_info, "pipe"))
+ type_qual = CL_KERNEL_ARG_TYPE_PIPE;
*(cl_kernel_arg_type_qualifier *)param_value = type_qual;
return CL_SUCCESS;
@@ -371,7 +441,8 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
k->opaque = opaque;
const char* kname = cl_kernel_get_name(k);
- if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+ if (kname != NULL &&
+ strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
k->vme = 1;
else
k->vme = 0;
@@ -389,7 +460,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
/* Get image data & size */
k->image_sz = interp_kernel_get_image_size(k->opaque);
assert(k->sampler_sz <= GEN_MAX_SURFACES);
- assert(k->image_sz <= ctx->device->max_read_image_args + ctx->device->max_write_image_args);
+ assert(k->image_sz <= ctx->devices[0]->max_read_image_args + ctx->devices[0]->max_write_image_args);
if (k->image_sz > 0) {
TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
interp_kernel_get_image_data(k->opaque, k->images);
@@ -409,17 +480,16 @@ cl_kernel_dup(cl_kernel from)
if (UNLIKELY(from == NULL))
return NULL;
TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
- SET_ICD(to->dispatch)
+ CL_OBJECT_INIT_BASE(to, CL_OBJECT_KERNEL_MAGIC);
to->bo = from->bo;
to->opaque = from->opaque;
to->vme = from->vme;
- to->ref_n = 1;
- to->magic = CL_MAGIC_KERNEL_HEADER;
to->program = from->program;
to->arg_n = from->arg_n;
to->curbe_sz = from->curbe_sz;
to->sampler_sz = from->sampler_sz;
to->image_sz = from->image_sz;
+ to->exec_info_n = from->exec_info_n;
memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
to->stack_size = from->stack_size;
if (to->sampler_sz)
@@ -429,6 +499,10 @@ cl_kernel_dup(cl_kernel from)
memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
} else
to->images = NULL;
+ if (to->exec_info_n) { /* Must always be 0 here */
+ TRY_ALLOC_NO_ERR(to->exec_info, cl_calloc(to->exec_info_n, sizeof(void *)));
+ memcpy(to->exec_info, from->exec_info, to->exec_info_n * sizeof(void *));
+ }
TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
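
cl_kernel_set_arg_svm_pointer() and cl_kernel_set_exec_info() above back the standard 2.0 entry points clSetKernelArgSVMPointer() and clSetKernelExecInfo(). A host-side sketch of the coarse-grain SVM flow they serve, assuming valid ctx/queue/kernel and omitting error handling:

#include <CL/cl.h>
#include <string.h>

static void run_with_svm(cl_context ctx, cl_command_queue queue,
                         cl_kernel kernel, size_t n)
{
  float *buf = (float *)clSVMAlloc(ctx, CL_MEM_READ_WRITE,
                                   n * sizeof(float), 0);
  /* Coarse-grain SVM must be mapped before the host touches it. */
  clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, buf,
                  n * sizeof(float), 0, NULL, NULL);
  memset(buf, 0, n * sizeof(float));
  clEnqueueSVMUnmap(queue, buf, 0, NULL, NULL);

  clSetKernelArgSVMPointer(kernel, 0, buf);
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &n, NULL, 0, NULL, NULL);
  clFinish(queue);
  clSVMFree(ctx, buf);
}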
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 05a882e..8acd82a 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -21,6 +21,7 @@
#define __CL_KERNEL_H__
#include "cl_internals.h"
+#include "cl_base_object.h"
#include "cl_driver.h"
#include "cl_gbe_loader.h"
#include "CL/cl.h"
@@ -40,15 +41,15 @@ typedef struct cl_argument {
cl_sampler sampler; /* For sampler. */
cl_accelerator_intel accel;
unsigned char bti;
- uint32_t local_sz:31; /* For __local size specification */
+ void *ptr; /* SVM ptr value. */
+ uint32_t local_sz:30; /* For __local size specification */
uint32_t is_set:1; /* All args must be set before NDRange */
+ uint32_t is_svm:1; /* Indicate this argument is SVMPointer */
} cl_argument;
/* One OCL function */
struct _cl_kernel {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a kernel */
- volatile int ref_n; /* We reference count this object */
+ _cl_base_object base;
cl_buffer bo; /* The code itself */
cl_program program; /* Owns this structure (and pointers) */
gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */
@@ -71,8 +72,19 @@ struct _cl_kernel {
uint32_t vme:1; /* True only if it is a built-in kernel for VME */
void* cmrt_kernel; /* CmKernel* */
+ uint32_t exec_info_n; /* The kernel's exec info count */
+ void** exec_info; /* The kernel's exec info */
+ cl_bool useDeviceEnqueue; /* kernel uses device enqueue */
+ void* device_enqueue_ptr; /* device enqueue buffer */
+ uint32_t device_enqueue_info_n; /* count of the parent kernel's argument buffers, passed as the child enqueues' exec info */
+ void** device_enqueue_infos; /* the parent kernel's argument buffers, passed as the child enqueues' exec info */
};
+#define CL_OBJECT_KERNEL_MAGIC 0x1234567890abedefLL
+#define CL_OBJECT_IS_KERNEL(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_KERNEL_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+
/* Allocate an empty kernel */
extern cl_kernel cl_kernel_new(cl_program);
@@ -104,6 +116,12 @@ extern int cl_kernel_set_arg(cl_kernel,
uint32_t arg_index,
size_t arg_size,
const void *arg_value);
+extern int cl_kernel_set_arg_svm_pointer(cl_kernel,
+ uint32_t arg_index,
+ const void *arg_value);
+extern cl_int cl_kernel_set_exec_info(cl_kernel k,
+ size_t n,
+ const void *value);
/* Get the argument information */
extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 84b4beb..7b3600c 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -169,23 +169,23 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
(void *) NULL,
(void *) NULL,
(void *) NULL,
-#if (OCL_ICD_IDENTIFIED_FUNCTIONS > 110)
(void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) NULL,
- (void *) clGetKernelSubGroupInfoKHR,
#endif
+#ifdef CL_VERSION_2_0
+ clCreateCommandQueueWithProperties,
+ clCreatePipe,
+ clGetPipeInfo,
+ clSVMAlloc,
+ clSVMFree,
+ clEnqueueSVMFree,
+ clEnqueueSVMMemcpy,
+ clEnqueueSVMMemFill,
+ clEnqueueSVMMap,
+ clEnqueueSVMUnmap,
+ clCreateSamplerWithProperties,
+ clSetKernelArgSVMPointer,
+ clSetKernelExecInfo,
+ clGetKernelSubGroupInfoKHR,
#endif
};
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
index 3985d80..58cee68 100644
--- a/src/cl_khr_icd.h
+++ b/src/cl_khr_icd.h
@@ -21,13 +21,11 @@
#define SET_ICD(dispatch) \
dispatch = &cl_khr_icd_dispatch;
-#define INIT_ICD(member) .member = &cl_khr_icd_dispatch,
#define DEFINE_ICD(member) struct _cl_icd_dispatch const *member;
extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
#else
#define SET_ICD(dispatch)
-#define INIT_ICD(member)
#define DEFINE_ICD(member)
#endif
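
Among the cl_mem.c changes below is pipe support (cl_mem_new_pipe), where the first 128 bytes of the allocation hold the pipe's bookkeeping counters. From the API side, pipes go through the standard clCreatePipe()/clGetPipeInfo() entry points; a sketch with an illustrative packet size and depth, error handling omitted:

#include <CL/cl.h>

static cl_mem make_pipe(cl_context ctx)
{
  cl_int err;
  /* 16-byte packets, up to 128 in flight. */
  cl_mem pipe = clCreatePipe(ctx, CL_MEM_READ_WRITE, 16, 128, NULL, &err);
  if (err == CL_SUCCESS) {
    cl_uint pkt = 0;
    clGetPipeInfo(pipe, CL_PIPE_PACKET_SIZE, sizeof(pkt), &pkt, NULL);
  }
  return pipe;
}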
diff --git a/src/cl_mem.c b/src/cl_mem.c
index ad1c8c2..0278b7f 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -28,6 +28,7 @@
#include "cl_kernel.h"
#include "cl_command_queue.h"
#include "cl_cmrt.h"
+#include "cl_enqueue.h"
#include "CL/cl.h"
#include "CL/cl_intel.h"
@@ -35,6 +36,7 @@
#include <stdio.h>
#include <string.h>
#include <unistd.h>
+#include <math.h>
#define FIELD_SIZE(CASE,TYPE) \
case JOIN(CL_,CASE): \
@@ -48,7 +50,7 @@
#define MAX_TILING_SIZE 128 * MB
-static cl_mem_object_type
+LOCAL cl_mem_object_type
cl_get_mem_object_type(cl_mem mem)
{
switch (mem->type) {
@@ -67,166 +69,39 @@ cl_get_mem_object_type(cl_mem mem)
}
LOCAL cl_int
-cl_get_mem_object_info(cl_mem mem,
- cl_mem_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_get_pipe_info(cl_mem mem,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
+ _cl_mem_pipe *pipe;
switch(param_name)
{
- FIELD_SIZE(MEM_TYPE, cl_mem_object_type);
- FIELD_SIZE(MEM_FLAGS, cl_mem_flags);
- FIELD_SIZE(MEM_SIZE, size_t);
- FIELD_SIZE(MEM_HOST_PTR, void *);
- FIELD_SIZE(MEM_MAP_COUNT, cl_uint);
- FIELD_SIZE(MEM_REFERENCE_COUNT, cl_uint);
- FIELD_SIZE(MEM_CONTEXT, cl_context);
- FIELD_SIZE(MEM_ASSOCIATED_MEMOBJECT, cl_mem);
- FIELD_SIZE(MEM_OFFSET, size_t);
+ FIELD_SIZE(PIPE_PACKET_SIZE, cl_uint);
+ FIELD_SIZE(PIPE_MAX_PACKETS, cl_uint);
default:
return CL_INVALID_VALUE;
}
- switch(param_name)
- {
- case CL_MEM_TYPE:
- *((cl_mem_object_type *)param_value) = cl_get_mem_object_type(mem);
- break;
- case CL_MEM_FLAGS:
- *((cl_mem_flags *)param_value) = mem->flags;
- break;
- case CL_MEM_SIZE:
- *((size_t *)param_value) = mem->size;
- break;
- case CL_MEM_HOST_PTR:
- if(mem->type == CL_MEM_IMAGE_TYPE) {
- *((size_t *)param_value) = (size_t)mem->host_ptr;
- } else {
- struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
- *((size_t *)param_value) = (size_t)mem->host_ptr + buf->sub_offset;
- }
- break;
- case CL_MEM_MAP_COUNT:
- *((cl_uint *)param_value) = mem->map_ref;
- break;
- case CL_MEM_REFERENCE_COUNT:
- *((cl_uint *)param_value) = mem->ref_n;
- break;
- case CL_MEM_CONTEXT:
- *((cl_context *)param_value) = mem->ctx;
- break;
- case CL_MEM_ASSOCIATED_MEMOBJECT:
- if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
- *((cl_mem *)param_value) = NULL;
- } else {
- struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
- *((cl_mem *)param_value) = (cl_mem)(buf->parent);
- }
- break;
- case CL_MEM_OFFSET:
- if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
- *((size_t *)param_value) = 0;
- } else {
- struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
- *((size_t *)param_value) = buf->sub_offset;
- }
- break;
- }
-
- return CL_SUCCESS;
-}
-
-#define IS_1D(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D || \
- image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
- image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
-
-#define IS_2D(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D || \
- image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
-
-#define IS_3D(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
-
-#define IS_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
- image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ if(mem->type != CL_MEM_PIPE_TYPE)
+ return CL_INVALID_MEM_OBJECT;
-LOCAL cl_int
-cl_get_image_info(cl_mem mem,
- cl_image_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
-{
- int err;
- CHECK_IMAGE(mem, image);
+ pipe = cl_mem_pipe(mem);
switch(param_name)
{
- FIELD_SIZE(IMAGE_FORMAT, cl_image_format);
- FIELD_SIZE(IMAGE_ELEMENT_SIZE, size_t);
- FIELD_SIZE(IMAGE_ROW_PITCH, size_t);
- FIELD_SIZE(IMAGE_SLICE_PITCH, size_t);
- FIELD_SIZE(IMAGE_WIDTH, size_t);
- FIELD_SIZE(IMAGE_HEIGHT, size_t);
- FIELD_SIZE(IMAGE_DEPTH, size_t);
- FIELD_SIZE(IMAGE_ARRAY_SIZE, size_t);
- FIELD_SIZE(IMAGE_BUFFER, cl_mem);
- FIELD_SIZE(IMAGE_NUM_MIP_LEVELS, cl_uint);
- FIELD_SIZE(IMAGE_NUM_SAMPLES, cl_uint);
- default:
- return CL_INVALID_VALUE;
- }
-
- switch(param_name)
- {
- case CL_IMAGE_FORMAT:
- *(cl_image_format *)param_value = image->fmt;
- break;
- case CL_IMAGE_ELEMENT_SIZE:
- *(size_t *)param_value = image->bpp;
- break;
- case CL_IMAGE_ROW_PITCH:
- *(size_t *)param_value = image->row_pitch;
- break;
- case CL_IMAGE_SLICE_PITCH:
- *(size_t *)param_value = image->slice_pitch;
- break;
- case CL_IMAGE_WIDTH:
-
- if (mem->type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
- struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image*) image;
- *(size_t *)param_value = buffer1d_image->size;
- } else
- *(size_t *)param_value = image->w;
- break;
- case CL_IMAGE_HEIGHT:
- if (mem->type == CL_MEM_BUFFER1D_IMAGE_TYPE)
- *(size_t *)param_value = 0;
- else
- *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
- break;
- case CL_IMAGE_DEPTH:
- *(size_t *)param_value = IS_3D(image) ? image->depth : 0;
+ case CL_PIPE_PACKET_SIZE:
+ *((cl_uint *)param_value) = pipe->packet_size;
break;
- case CL_IMAGE_ARRAY_SIZE:
- *(size_t *)param_value = IS_ARRAY(image) ? image->depth : 0;
- break;
- case CL_IMAGE_BUFFER:
- *(cl_mem *)param_value = image->buffer_1d;
- break;
- case CL_IMAGE_NUM_MIP_LEVELS:
- case CL_IMAGE_NUM_SAMPLES:
- *(cl_mem *)param_value = 0;
+ case CL_PIPE_MAX_PACKETS:
+ *((cl_uint *)param_value) = pipe->max_packets;
break;
}
return CL_SUCCESS;
-
-error:
- return err;
}
-#undef FIELD_SIZE
-
LOCAL cl_mem
cl_mem_allocate(enum cl_mem_type type,
cl_context ctx,
@@ -257,18 +132,23 @@ cl_mem_allocate(enum cl_mem_type type,
struct _cl_mem_buffer1d_image *buffer1d_image = NULL;
TRY_ALLOC(buffer1d_image, CALLOC(struct _cl_mem_buffer1d_image));
mem = &buffer1d_image->base.base;
+ } else if (type == CL_MEM_PIPE_TYPE) {
+ _cl_mem_pipe *pipe = NULL;
+ TRY_ALLOC(pipe, CALLOC(struct _cl_mem_pipe));
+ mem = &pipe->base;
} else {
struct _cl_mem_buffer *buffer = NULL;
TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
mem = &buffer->base;
}
+
+ CL_OBJECT_INIT_BASE(mem, CL_OBJECT_MEM_MAGIC);
+ list_init(&mem->dstr_cb_head);
mem->type = type;
- SET_ICD(mem->dispatch)
- mem->ref_n = 1;
- mem->magic = CL_MAGIC_MEM_HEADER;
mem->flags = flags;
mem->is_userptr = 0;
mem->offset = 0;
+ mem->is_svm = 0;
mem->cmrt_mem = NULL;
if (mem->type == CL_MEM_IMAGE_TYPE) {
cl_mem_image(mem)->is_image_from_buffer = 0;
@@ -285,17 +165,26 @@ cl_mem_allocate(enum cl_mem_type type,
#ifdef HAS_USERPTR
uint8_t bufCreated = 0;
- if (ctx->device->host_unified_memory) {
+ if (ctx->devices[0]->host_unified_memory) {
int page_size = getpagesize();
int cacheline_size = 0;
- cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+ cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
if (type == CL_MEM_BUFFER_TYPE) {
if (flags & CL_MEM_USE_HOST_PTR) {
assert(host_ptr != NULL);
+ cl_mem svm_mem = NULL;
+ if((svm_mem = cl_context_get_svm_from_ptr(ctx, host_ptr)) != NULL)
+ mem->is_svm = 1;
/* userptr not support tiling */
if (!is_tiled) {
- if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) &&
+ if(svm_mem != NULL) { // SVM allocations are always page-aligned
+ mem->offset = 0;
+ mem->is_userptr = 1;
+ mem->bo = svm_mem->bo;
+ cl_mem_add_ref(svm_mem);
+ bufCreated = 1;
+ } else if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) &&
(ALIGN((unsigned long)sz, cacheline_size) == (unsigned long)sz)) {
void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1)));
mem->offset = host_ptr - aligned_host_ptr;
@@ -333,7 +222,7 @@ cl_mem_allocate(enum cl_mem_type type,
// if create image from USE_HOST_PTR buffer, the buffer's base address need be aligned.
if(buffer->is_userptr) {
int base_alignement = 0;
- cl_get_device_info(ctx->device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
+ cl_get_device_info(ctx->devices[0], CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
if(ALIGN((unsigned long)buffer->host_ptr, base_alignement) != (unsigned long)buffer->host_ptr) {
err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
goto error;
@@ -363,15 +252,8 @@ cl_mem_allocate(enum cl_mem_type type,
mem->size = sz;
}
- cl_context_add_ref(ctx);
- mem->ctx = ctx;
- /* Append the buffer in the context buffer list */
- pthread_mutex_lock(&ctx->buffer_lock);
- mem->next = ctx->buffers;
- if (ctx->buffers != NULL)
- ctx->buffers->prev = mem;
- ctx->buffers = mem;
- pthread_mutex_unlock(&ctx->buffer_lock);
+ /* Append the buffer in the context buffer list */
+ cl_context_add_mem(ctx, mem);
exit:
if (errcode)
@@ -385,17 +267,26 @@ error:
}
LOCAL cl_int
-is_valid_mem(cl_mem mem, cl_mem buffers)
+cl_mem_is_valid(cl_mem mem, cl_context ctx)
{
- cl_mem tmp = buffers;
- while(tmp){
- if(mem == tmp){
- if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+ struct list_node *pos;
+ cl_base_object pbase_object;
+
+ CL_OBJECT_LOCK(ctx);
+ list_for_each (pos, (&ctx->mem_objects)) {
+ pbase_object = list_entry(pos, _cl_base_object, node);
+ if (pbase_object == (cl_base_object)mem) {
+ if (UNLIKELY(!CL_OBJECT_IS_MEM(mem))) {
+ CL_OBJECT_UNLOCK(ctx);
return CL_INVALID_MEM_OBJECT;
+ }
+
+ CL_OBJECT_UNLOCK(ctx);
return CL_SUCCESS;
}
- tmp = tmp->next;
}
+
+ CL_OBJECT_UNLOCK(ctx);
return CL_INVALID_MEM_OBJECT;
}
@@ -448,7 +339,7 @@ cl_mem_new_buffer(cl_context ctx,
goto error;
}
- if ((err = cl_get_device_info(ctx->device,
+ if ((err = cl_get_device_info(ctx->devices[0],
CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(max_mem_size),
&max_mem_size,
@@ -550,7 +441,7 @@ cl_mem_new_sub_buffer(cl_mem buffer,
goto error;
}
- if (info->origin & (buffer->ctx->device->mem_base_addr_align / 8 - 1)) {
+ if (info->origin & (buffer->ctx->devices[0]->mem_base_addr_align / 8 - 1)) {
err = CL_MISALIGNED_SUB_BUFFER_OFFSET;
goto error;
}
@@ -558,10 +449,10 @@ cl_mem_new_sub_buffer(cl_mem buffer,
/* Now create the sub buffer and link it to the buffer. */
TRY_ALLOC (sub_buf, CALLOC(struct _cl_mem_buffer));
mem = &sub_buf->base;
+
+ CL_OBJECT_INIT_BASE(mem, CL_OBJECT_MEM_MAGIC);
+ list_init(&mem->dstr_cb_head);
mem->type = CL_MEM_SUBBUFFER_TYPE;
- SET_ICD(mem->dispatch)
- mem->ref_n = 1;
- mem->magic = CL_MAGIC_MEM_HEADER;
mem->flags = flags;
mem->offset = buffer->offset;
mem->is_userptr = buffer->is_userptr;
@@ -579,19 +470,12 @@ cl_mem_new_sub_buffer(cl_mem buffer,
mem->bo = buffer->bo;
mem->size = info->size;
sub_buf->sub_offset = info->origin;
- if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR) {
+ if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR || buffer->flags & CL_MEM_ALLOC_HOST_PTR) {
mem->host_ptr = buffer->host_ptr;
}
- cl_context_add_ref(buffer->ctx);
- mem->ctx = buffer->ctx;
/* Append the buffer in the context buffer list */
- pthread_mutex_lock(&buffer->ctx->buffer_lock);
- mem->next = buffer->ctx->buffers;
- if (buffer->ctx->buffers != NULL)
- buffer->ctx->buffers->prev = mem;
- buffer->ctx->buffers = mem;
- pthread_mutex_unlock(&buffer->ctx->buffer_lock);
+ cl_context_add_mem(buffer->ctx, mem);
exit:
if (errcode_ret)
@@ -603,6 +487,68 @@ error:
goto exit;
}
+cl_mem cl_mem_new_pipe(cl_context ctx,
+ cl_mem_flags flags,
+ cl_uint packet_size,
+ cl_uint max_packets,
+ cl_int *errcode_ret)
+{
+ _cl_mem_pipe* pipe = NULL;
+ cl_uint *ptr = NULL;
+ cl_mem mem = NULL;
+ cl_int err;
+ cl_uint sz;
+ if(UNLIKELY((pipe = CALLOC(_cl_mem_pipe)) == NULL)) {
+ err = CL_OUT_OF_RESOURCES;
+ goto error;
+ }
+
+ sz = packet_size * max_packets;
+ assert(sz != 0);
+
+ /* HSW: byte-scattered reads/writes require that
+ the buffer size be a multiple of 4 bytes. */
+ sz = ALIGN(sz, 4);
+
+ sz += 128; // The pipe header holds the bookkeeping struct; align to 128 bytes for the largest data type (double16).
+
+ mem = cl_mem_allocate(CL_MEM_PIPE_TYPE, ctx, flags, sz, CL_FALSE,NULL , NULL, &err);
+
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ ptr = cl_mem_map_auto(mem, 1);
+ if(ptr == NULL){
+ err = CL_OUT_OF_RESOURCES;
+ goto error;
+ }
+ ptr[0] = max_packets;
+ ptr[1] = packet_size;
+ ptr[2] = 0; //write ptr
+ ptr[3] = 0; //read ptr
+ ptr[4] = 0; //reservation read ptr
+ ptr[5] = 0; //reservation write ptr
+ ptr[6] = 0; //packet num
+ cl_mem_unmap(mem);
+
+ pipe = cl_mem_pipe(mem);
+ pipe->flags = flags;
+ pipe->packet_size = packet_size;
+ pipe->max_packets = max_packets;
+
+ return mem;
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+
+}
+
void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
{
cl_buffer_unreference(buffer->bo);
@@ -620,6 +566,81 @@ void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
}
}
+void* cl_mem_svm_allocate(cl_context ctx, cl_svm_mem_flags flags,
+ size_t size, unsigned int alignment)
+{
+ cl_int err = CL_SUCCESS;
+ size_t max_mem_size;
+
+ if(UNLIKELY(alignment & (alignment - 1)))
+ return NULL;
+
+ if ((err = cl_get_device_info(ctx->devices[0],
+ CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+ sizeof(max_mem_size),
+ &max_mem_size,
+ NULL)) != CL_SUCCESS) {
+ return NULL;
+ }
+
+ if(UNLIKELY(size == 0 || size > max_mem_size)) {
+ return NULL;
+ }
+
+ if (flags & (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS)) {
+ return NULL;
+ }
+ if (flags && ((flags & (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS))
+ || ((flags & CL_MEM_WRITE_ONLY) && (flags & CL_MEM_READ_ONLY))
+ || ((flags & CL_MEM_WRITE_ONLY) && (flags & CL_MEM_READ_WRITE))
+ || ((flags & CL_MEM_READ_ONLY) && (flags & CL_MEM_READ_WRITE)))) {
+ return NULL;
+ }
+
+ void * ptr = NULL;
+#ifdef HAS_BO_SET_SOFTPIN
+ cl_buffer_mgr bufmgr = NULL;
+ cl_mem mem;
+ _cl_mem_svm* svm;
+ if(UNLIKELY((svm = CALLOC(_cl_mem_svm)) == NULL))
+ return NULL;
+ mem = &svm->base;
+
+ mem->type = CL_MEM_SVM_TYPE;
+ CL_OBJECT_INIT_BASE(mem, CL_OBJECT_MEM_MAGIC);
+ list_init(&mem->dstr_cb_head);
+ mem->flags = flags | CL_MEM_USE_HOST_PTR;
+ mem->is_userptr = 0;
+ mem->is_svm = 0;
+ mem->offset = 0;
+
+ bufmgr = cl_context_get_bufmgr(ctx);
+ assert(bufmgr);
+
+ int page_size = getpagesize();
+ const size_t alignedSZ = ALIGN(size, page_size);
+ if(alignment == 0)
+ alignment = page_size;
+ else
+ alignment = ALIGN(alignment, page_size);
+ ptr = cl_aligned_malloc(alignedSZ, alignment);
+ if(ptr == NULL) return NULL;
+
+ mem->host_ptr = ptr;
+ mem->is_svm = 1;
+ mem->is_userptr = 1;
+ mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL SVM memory object", ptr, alignedSZ, 0);
+ mem->size = size;
+ cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
+ cl_buffer_set_bo_use_full_range(mem->bo, 1);
+
+ /* Append the svm in the context buffer list */
+ cl_context_add_mem(ctx, mem);
+#endif
+
+ return ptr;
+}
+
void
cl_mem_copy_image_region(const size_t *origin, const size_t *region,
void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
@@ -790,7 +811,7 @@ _cl_mem_new_image(cl_context ctx,
h = 1;
depth = 1;
- if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+ if (UNLIKELY(w > ctx->devices[0]->image2d_max_width)) DO_IMAGE_ERROR;
if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
if (UNLIKELY(data && (slice_pitch % pitch != 0))) DO_IMAGE_ERROR;
if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
@@ -800,11 +821,11 @@ _cl_mem_new_image(cl_context ctx,
image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
if (image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
- if (UNLIKELY(w > ctx->device->image_mem_size)) DO_IMAGE_ERROR;
+ if (UNLIKELY(w > ctx->devices[0]->image_mem_size)) DO_IMAGE_ERROR;
/* This is an image1d buffer which exceeds normal image size restrication
We have to use a 2D image to simulate this 1D image. */
- h = (w + ctx->device->image2d_max_width - 1) / ctx->device->image2d_max_width;
- w = w > ctx->device->image2d_max_width ? ctx->device->image2d_max_width : w;
+ h = (w + ctx->devices[0]->image2d_max_width - 1) / ctx->devices[0]->image2d_max_width;
+ w = w > ctx->devices[0]->image2d_max_width ? ctx->devices[0]->image2d_max_width : w;
tiling = CL_NO_TILE;
} else if(image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL) {
tiling = CL_NO_TILE;
@@ -817,8 +838,8 @@ _cl_mem_new_image(cl_context ctx,
if (data && pitch == 0)
pitch = min_pitch;
- if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
- if (UNLIKELY(h > ctx->device->image2d_max_height)) DO_IMAGE_ERROR;
+ if (UNLIKELY(w > ctx->devices[0]->image2d_max_width)) DO_IMAGE_ERROR;
+ if (UNLIKELY(h > ctx->devices[0]->image2d_max_height)) DO_IMAGE_ERROR;
if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
if (UNLIKELY(!data && pitch != 0 && buffer == NULL)) DO_IMAGE_ERROR;
@@ -838,11 +859,11 @@ _cl_mem_new_image(cl_context ctx,
size_t min_slice_pitch = pitch * h;
if (data && slice_pitch == 0)
slice_pitch = min_slice_pitch;
- if (UNLIKELY(w > ctx->device->image3d_max_width)) DO_IMAGE_ERROR;
- if (UNLIKELY(h > ctx->device->image3d_max_height)) DO_IMAGE_ERROR;
+ if (UNLIKELY(w > ctx->devices[0]->image3d_max_width)) DO_IMAGE_ERROR;
+ if (UNLIKELY(h > ctx->devices[0]->image3d_max_height)) DO_IMAGE_ERROR;
if (image_type == CL_MEM_OBJECT_IMAGE3D &&
- (UNLIKELY(depth > ctx->device->image3d_max_depth))) DO_IMAGE_ERROR
- else if (UNLIKELY(depth > ctx->device->image_max_array_size)) DO_IMAGE_ERROR;
+ (UNLIKELY(depth > ctx->devices[0]->image3d_max_depth))) DO_IMAGE_ERROR
+ else if (UNLIKELY(depth > ctx->devices[0]->image_max_array_size)) DO_IMAGE_ERROR;
if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
if (UNLIKELY(data && min_slice_pitch > slice_pitch)) DO_IMAGE_ERROR;
if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
@@ -854,9 +875,9 @@ _cl_mem_new_image(cl_context ctx,
#undef DO_IMAGE_ERROR
uint8_t enableUserptr = 0;
- if (enable_true_hostptr && ctx->device->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
+ if (enable_true_hostptr && ctx->devices[0]->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
int cacheline_size = 0;
- cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+ cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data &&
ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)) == h &&
ALIGN(h * pitch * depth, cacheline_size) == h * pitch * depth && //h and pitch should same as aligned_h and aligned_pitch if enable userptr
@@ -1033,7 +1054,7 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
goto error;
}
- if ((err = cl_get_device_info(ctx->device,
+ if ((err = cl_get_device_info(ctx->devices[0],
CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
sizeof(max_size),
&max_size,
@@ -1103,6 +1124,8 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
memcpy(dst, src, mem_buffer->base.size);
cl_mem_unmap(image);
cl_mem_unmap(buffer);
+ struct _cl_mem_buffer1d_image* image_buffer = (struct _cl_mem_buffer1d_image*)image;
+ image_buffer->descbuffer = buffer;
}
else
assert(0);
@@ -1172,14 +1195,28 @@ cl_mem_new_image(cl_context context,
}
LOCAL void
+cl_mem_svm_delete(cl_context ctx, void *svm_pointer)
+{
+ cl_mem mem;
+ if(UNLIKELY(svm_pointer == NULL))
+ return;
+ mem = cl_context_get_svm_from_ptr(ctx, svm_pointer);
+ if(mem == NULL)
+ return;
+ cl_mem_delete(mem);
+}
+
+LOCAL void
cl_mem_delete(cl_mem mem)
{
cl_int i;
+ cl_mem_dstr_cb cb = NULL;
+
if (UNLIKELY(mem == NULL))
return;
- if (atomic_dec(&mem->ref_n) > 1)
+ if (CL_OBJECT_DEC_REF(mem) > 1)
return;
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL
if (UNLIKELY(IS_GL_IMAGE(mem))) {
cl_mem_gl_delete(cl_mem_gl_image(mem));
}
@@ -1190,6 +1227,14 @@ cl_mem_delete(cl_mem mem)
cmrt_destroy_memory(mem);
#endif
+ /* First, call all the callbacks registered by user. */
+ while (!list_empty(&mem->dstr_cb_head)) {
+ cb = list_entry(mem->dstr_cb_head.head_node.n, _cl_mem_dstr_cb, node);
+ list_node_del(&cb->node);
+ cb->pfn_notify(mem, cb->user_data);
+ cl_free(cb);
+ }
+
/* iff we are a image, delete the 1d buffer if has. */
if (IS_IMAGE(mem)) {
if (cl_mem_image(mem)->buffer_1d) {
@@ -1204,21 +1249,6 @@ cl_mem_delete(cl_mem mem)
}
}
- /* Remove it from the list */
- if (mem->ctx) {
- pthread_mutex_lock(&mem->ctx->buffer_lock);
- if (mem->prev)
- mem->prev->next = mem->next;
- if (mem->next)
- mem->next->prev = mem->prev;
- if (mem->ctx->buffers == mem)
- mem->ctx->buffers = mem->next;
- pthread_mutex_unlock(&mem->ctx->buffer_lock);
- cl_context_delete(mem->ctx);
- } else {
- assert((mem->prev == 0) && (mem->next == 0));
- }
-
/* Someone still mapped, unmap */
if(mem->map_ref > 0) {
assert(mem->mapped_ptr);
@@ -1234,16 +1264,6 @@ cl_mem_delete(cl_mem mem)
if (mem->mapped_ptr)
free(mem->mapped_ptr);
- if (mem->dstr_cb) {
- cl_mem_dstr_cb *cb = mem->dstr_cb;
- while (mem->dstr_cb) {
- cb = mem->dstr_cb;
- cb->pfn_notify(mem, cb->user_data);
- mem->dstr_cb = cb->next;
- free(cb);
- }
- }
-
/* Iff we are sub, do nothing for bo release. */
if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
@@ -1258,15 +1278,24 @@ cl_mem_delete(cl_mem mem)
buffer->parent->subs = buffer->sub_next;
pthread_mutex_unlock(&buffer->parent->sub_lock);
cl_mem_delete((cl_mem )(buffer->parent));
+ } else if (mem->is_svm && mem->type != CL_MEM_SVM_TYPE) {
+ cl_mem svm_mem = cl_context_get_svm_from_ptr(mem->ctx, mem->host_ptr);
+ if (svm_mem != NULL)
+ cl_mem_delete(svm_mem);
} else if (LIKELY(mem->bo != NULL)) {
cl_buffer_unreference(mem->bo);
}
- if (mem->is_userptr &&
+ /* Remove it from the list */
+ cl_context_remove_mem(mem->ctx, mem);
+
+ if ((mem->is_userptr &&
(mem->flags & CL_MEM_ALLOC_HOST_PTR) &&
- (mem->type != CL_MEM_SUBBUFFER_TYPE))
+ (mem->type != CL_MEM_SUBBUFFER_TYPE)) ||
+ (mem->is_svm && mem->type == CL_MEM_SVM_TYPE))
cl_free(mem->host_ptr);
+ CL_OBJECT_DESTROY_BASE(mem);
cl_free(mem);
}
@@ -1274,7 +1303,7 @@ LOCAL void
cl_mem_add_ref(cl_mem mem)
{
assert(mem);
- atomic_inc(&mem->ref_n);
+ CL_OBJECT_INC_REF(mem);
}
#define LOCAL_SZ_0 16
@@ -1282,7 +1311,7 @@ cl_mem_add_ref(cl_mem mem)
#define LOCAL_SZ_2 4
LOCAL cl_int
-cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
size_t src_offset, size_t dst_offset, size_t cb)
{
cl_int ret = CL_SUCCESS;
@@ -1335,7 +1364,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
return ret;
}
@@ -1376,7 +1406,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
return ret;
}
@@ -1406,7 +1437,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
return ret;
}
@@ -1438,7 +1470,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
return ret;
}
@@ -1450,7 +1483,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
}
LOCAL cl_int
-cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image* src_image,
+cl_image_fill(cl_command_queue queue, cl_event e, const void * pattern, struct _cl_mem_image* src_image,
const size_t * origin, const size_t * region)
{
cl_int ret = CL_SUCCESS;
@@ -1458,6 +1491,8 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
size_t global_off[] = {0,0,0};
size_t global_sz[] = {1,1,1};
size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ uint32_t savedIntelFmt = src_image->intel_fmt;
+
if(region[1] == 1) local_sz[1] = 1;
if(region[2] == 1) local_sz[2] = 1;
@@ -1503,7 +1538,24 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
return CL_OUT_OF_RESOURCES;
cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
- cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
+ if(src_image->fmt.image_channel_order >= CL_sRGBA) {
+#define RGB2sRGB(linear) ( linear <= 0.0031308f )? ( 12.92f * linear ):( 1.055f * powf( linear, 1.0f/2.4f ) - 0.055f);
+ cl_image_format fmt;
+ float newpattern[4] = {0.0,0.0,0.0,((float*)pattern)[3]};
+ int i;
+ for(i = 0;i < 3; i++){
+ if(src_image->fmt.image_channel_order == CL_sRGBA) {
+ newpattern[i] = RGB2sRGB(((float*)pattern)[i]);
+ } else
+ newpattern[2-i] = RGB2sRGB(((float*)pattern)[i]);
+ }
+ cl_kernel_set_arg(ker, 1, sizeof(float)*4, newpattern);
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = CL_UNORM_INT8;
+ src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+#undef RGB2sRGB
+ } else
+ cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
@@ -1511,13 +1563,15 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
- ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, e, 3, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
+ src_image->intel_fmt = savedIntelFmt;
return ret;
}
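For context, the RGB2sRGB macro above is the standard linear-to-sRGB transfer function; note that for CL_sBGRA the loop also swaps the R and B channels into RGBA order. A minimal standalone sketch of the same conversion (input value invented for illustration):

#include <math.h>
#include <stdio.h>

/* Linear-to-sRGB transfer function, matching RGB2sRGB above. */
static float linear_to_srgb(float linear)
{
  return (linear <= 0.0031308f) ? 12.92f * linear
                                : 1.055f * powf(linear, 1.0f / 2.4f) - 0.055f;
}

int main(void)
{
  /* A linear mid-grey of 0.5 maps to roughly 0.7354 in sRGB. */
  printf("%f\n", linear_to_srgb(0.5f));
  return 0;
}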
LOCAL cl_int
-cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
cl_mem buffer, size_t offset, size_t size)
{
cl_int ret = CL_SUCCESS;
@@ -1614,13 +1668,14 @@ cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
if (is_128)
cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
return ret;
}
LOCAL cl_int
-cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
size_t src_row_pitch, size_t src_slice_pitch,
size_t dst_row_pitch, size_t dst_slice_pitch) {
@@ -1635,7 +1690,7 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
cl_int size = region[0]*region[1]*region[2];
- ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
+ ret = cl_mem_copy(queue, NULL, src_buf, dst_buf,src_offset, dst_offset, size);
return ret;
}
@@ -1687,14 +1742,16 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
return ret;
}
LOCAL cl_int
-cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
- const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
+cl_mem_kernel_copy_image(cl_command_queue queue, cl_event event, struct _cl_mem_image* src_image,
+ struct _cl_mem_image* dst_image, const size_t *src_origin,
+ const size_t *dst_origin, const size_t *region) {
cl_int ret;
cl_kernel ker = NULL;
size_t global_off[] = {0,0,0};
@@ -1722,7 +1779,9 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
if (fixupDataType) {
cl_image_format fmt;
- if (src_image->fmt.image_channel_order != CL_BGRA)
+ if (src_image->fmt.image_channel_order != CL_BGRA &&
+ src_image->fmt.image_channel_order != CL_sBGRA &&
+ src_image->fmt.image_channel_order != CL_sRGBA)
fmt.image_channel_order = src_image->fmt.image_channel_order;
else
fmt.image_channel_order = CL_RGBA;
@@ -1835,7 +1894,8 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
fail:
@@ -1848,7 +1908,7 @@ fail:
}
LOCAL cl_int
-cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
+cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_mem_image* image, cl_mem buffer,
const size_t *src_origin, const size_t dst_offset, const size_t *region) {
cl_int ret;
cl_kernel ker = NULL;
@@ -1937,7 +1997,8 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
fail:
@@ -1951,7 +2012,7 @@ fail:
LOCAL cl_int
-cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
+cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffer, struct _cl_mem_image* image,
const size_t src_offset, const size_t *dst_origin, const size_t *region) {
cl_int ret;
cl_kernel ker = NULL;
@@ -2037,7 +2098,8 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+ global_off, global_sz, global_sz, local_sz, local_sz);
cl_kernel_delete(ker);
image->intel_fmt = intel_fmt;
@@ -2326,3 +2388,103 @@ error:
mem = NULL;
goto exit;
}
+
+LOCAL cl_int
+cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
+ size_t size, const size_t *origin, const size_t *region)
+{
+  // TODO: Need to add MT-safe logic.
+
+ cl_int slot = -1;
+ int err = CL_SUCCESS;
+ size_t sub_offset = 0;
+
+ if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ sub_offset = buffer->sub_offset;
+ }
+
+ ptr = (char*)ptr + offset + sub_offset;
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+    //only compute the pointer here; the memcpy is done at enqueue time
+ *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
+ } else {
+ *mem_ptr = ptr;
+ }
+ /* Record the mapped address. */
+ if (!mem->mapped_ptr_sz) {
+ mem->mapped_ptr_sz = 16;
+ mem->mapped_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
+ if (!mem->mapped_ptr) {
+ cl_mem_unmap_auto(mem);
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = 0;
+ } else {
+ int i = 0;
+ for (; i < mem->mapped_ptr_sz; i++) {
+ if (mem->mapped_ptr[i].ptr == NULL) {
+ slot = i;
+ break;
+ }
+ }
+ if (i == mem->mapped_ptr_sz) {
+ cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
+ if (!new_ptr) {
+ cl_mem_unmap_auto(mem);
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ memcpy(new_ptr, mem->mapped_ptr,
+ mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = mem->mapped_ptr_sz;
+ mem->mapped_ptr_sz *= 2;
+ free(mem->mapped_ptr);
+ mem->mapped_ptr = new_ptr;
+ }
+ }
+ assert(slot != -1);
+ mem->mapped_ptr[slot].ptr = *mem_ptr;
+ mem->mapped_ptr[slot].v_ptr = ptr;
+ mem->mapped_ptr[slot].size = size;
+ if(origin) {
+ assert(region);
+ mem->mapped_ptr[slot].origin[0] = origin[0];
+ mem->mapped_ptr[slot].origin[1] = origin[1];
+ mem->mapped_ptr[slot].origin[2] = origin[2];
+ mem->mapped_ptr[slot].region[0] = region[0];
+ mem->mapped_ptr[slot].region[1] = region[1];
+ mem->mapped_ptr[slot].region[2] = region[2];
+ }
+ mem->map_ref++;
+error:
+ if (err != CL_SUCCESS)
+ *mem_ptr = NULL;
+ return err;
+}
+
+LOCAL cl_int
+cl_mem_set_destructor_callback(cl_mem memobj,
+ void(CL_CALLBACK *pfn_notify)(cl_mem, void *), void *user_data)
+{
+ cl_mem_dstr_cb cb = cl_calloc(1, sizeof(_cl_mem_dstr_cb));
+ if (cb == NULL) {
+ return CL_OUT_OF_HOST_MEMORY;
+ }
+
+ memset(cb, 0, sizeof(_cl_mem_dstr_cb));
+ list_node_init(&cb->node);
+ cb->pfn_notify = pfn_notify;
+ cb->user_data = user_data;
+
+ CL_OBJECT_LOCK(memobj);
+ list_add(&memobj->dstr_cb_head, &cb->node);
+ CL_OBJECT_UNLOCK(memobj);
+ return CL_SUCCESS;
+}
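The dstr_cb_head list populated by cl_mem_set_destructor_callback above is what backs clSetMemObjectDestructorCallback at the API level. A minimal usage sketch, assuming the usual platform/context boilerplate and that list_add prepends (so callbacks fire in reverse registration order, as the OpenCL spec requires):

#include <stdio.h>
#include <CL/cl.h>

static void CL_CALLBACK on_destroy(cl_mem memobj, void *user_data)
{
  /* Runs when the last reference to the buffer is dropped. */
  printf("destructor callback: %s\n", (char *)user_data);
}

int main(void)
{
  cl_platform_id plat; cl_device_id dev; cl_int err;
  clGetPlatformIDs(1, &plat, NULL);
  clGetDeviceIDs(plat, CL_DEVICE_TYPE_ALL, 1, &dev, NULL);
  cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 4096, NULL, &err);
  clSetMemObjectDestructorCallback(buf, on_destroy, "registered first");
  clSetMemObjectDestructorCallback(buf, on_destroy, "registered second");
  clReleaseMemObject(buf); /* prints "registered second", then "registered first" */
  clReleaseContext(ctx);
  return 0;
}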
diff --git a/src/cl_mem.h b/src/cl_mem.h
index c8f256d..4764401 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -23,9 +23,12 @@
#include "cl_internals.h"
#include "cl_driver_type.h"
#include "CL/cl.h"
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
#include <assert.h>
#include <pthread.h>
+#if defined(HAS_GL_EGL)
+#include "EGL/egl.h"
+#endif
#ifndef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
@@ -61,15 +64,18 @@ typedef struct _cl_mapped_ptr {
}cl_mapped_ptr;
typedef struct _cl_mem_dstr_cb {
- struct _cl_mem_dstr_cb * next;
- void (CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
+ list_node node; /* Mem callback list node */
+ void(CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
void *user_data;
-}cl_mem_dstr_cb;
+} _cl_mem_dstr_cb;
+typedef _cl_mem_dstr_cb* cl_mem_dstr_cb;
/* Used for buffers and images */
enum cl_mem_type {
CL_MEM_BUFFER_TYPE,
CL_MEM_SUBBUFFER_TYPE,
+ CL_MEM_PIPE_TYPE,
+ CL_MEM_SVM_TYPE,
CL_MEM_IMAGE_TYPE,
CL_MEM_GL_IMAGE_TYPE,
CL_MEM_BUFFER1D_IMAGE_TYPE
@@ -78,11 +84,8 @@ enum cl_mem_type {
#define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
typedef struct _cl_mem {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a memory object */
- cl_mem prev, next; /* We chain the memory buffers together */
+ _cl_base_object base;
enum cl_mem_type type;
- volatile int ref_n; /* This object is reference counted */
cl_buffer bo; /* Data in GPU memory */
size_t size; /* original request size, not alignment size, used in constant buffer */
cl_context ctx; /* Context it belongs to */
@@ -92,19 +95,45 @@ typedef struct _cl_mem {
int mapped_ptr_sz; /* The array size of mapped_ptr. */
int map_ref; /* The mapped count. */
uint8_t mapped_gtt; /* This object has mapped gtt, for unmap. */
- cl_mem_dstr_cb *dstr_cb; /* The destroy callback. */
- uint8_t is_userptr; /* CL_MEM_USE_HOST_PTR is enabled*/
+ list_head dstr_cb_head; /* All destroy callbacks. */
+ uint8_t is_userptr; /* CL_MEM_USE_HOST_PTR is enabled */
+ cl_bool is_svm; /* This object is svm */
size_t offset; /* offset of host_ptr to the page beginning, only for CL_MEM_USE_HOST_PTR*/
uint8_t cmrt_mem_type; /* CmBuffer, CmSurface2D, ... */
void* cmrt_mem;
} _cl_mem;
+#define CL_OBJECT_MEM_MAGIC 0x381a27b9ee6504dfLL
+#define CL_OBJECT_IS_MEM(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_MEM_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+#define CL_OBJECT_IS_IMAGE(mem) ((mem && \
+ ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC && \
+ CL_OBJECT_GET_REF(mem) >= 1 && \
+ mem->type >= CL_MEM_IMAGE_TYPE))
+#define CL_OBJECT_IS_BUFFER(mem) ((mem && \
+ ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC && \
+ CL_OBJECT_GET_REF(mem) >= 1 && \
+ mem->type < CL_MEM_IMAGE_TYPE))
+
+typedef struct _cl_mem_pipe {
+ _cl_mem base;
+ cl_svm_mem_flags flags; /* Flags specified at the creation time */
+ uint32_t packet_size;
+ uint32_t max_packets;
+} _cl_mem_pipe;
+
+typedef struct _cl_mem_svm {
+ _cl_mem base;
+ cl_svm_mem_flags flags; /* Flags specified at the creation time */
+} _cl_mem_svm;
+
struct _cl_mem_image {
_cl_mem base;
cl_image_format fmt; /* only for images */
uint32_t intel_fmt; /* format to provide in the surface state */
- uint32_t bpp; /* number of bytes per pixel */
+ size_t bpp; /* number of bytes per pixel */
cl_mem_object_type image_type; /* only for images 1D/2D...*/
size_t w, h, depth; /* only for images (depth is only for 3D images) */
size_t row_pitch, slice_pitch;
@@ -118,16 +147,30 @@ struct _cl_mem_image {
struct _cl_mem_gl_image {
struct _cl_mem_image base;
- uint32_t target;
- int miplevel;
- uint32_t texture;
+ int fd;
+#if defined(HAS_GL_EGL)
+ EGLImage egl_image;
+#endif
};
struct _cl_mem_buffer1d_image {
struct _cl_mem_image base;
uint32_t size;
+ _cl_mem * descbuffer;
};
+#define IS_1D_IMAGE(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D || \
+ image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+ image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+
+#define IS_2D_IMAGE(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D || \
+ image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
+#define IS_3D_IMAGE(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
+
+#define IS_IMAGE_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+ image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
inline static void
cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
cl_mem_object_type image_type,
@@ -176,21 +219,18 @@ cl_mem_gl_image(cl_mem mem)
return (struct _cl_mem_gl_image*)mem;
}
-inline static struct _cl_mem_buffer *
-cl_mem_buffer(cl_mem mem)
+inline static struct _cl_mem_pipe *
+cl_mem_pipe(cl_mem mem)
{
- assert(!IS_IMAGE(mem));
- return (struct _cl_mem_buffer *)mem;
+ assert(mem->type == CL_MEM_PIPE_TYPE);
+ return (struct _cl_mem_pipe *)mem;
}
/* Query information about a memory object */
-extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
-
-/* Query information about an image */
-extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
+extern cl_mem_object_type cl_get_mem_object_type(cl_mem mem);
/* Query whether mem is in buffers */
-extern cl_int is_valid_mem(cl_mem mem, cl_mem buffers);
+extern cl_int cl_mem_is_valid(cl_mem mem, cl_context ctx);
/* Create a new memory object and initialize it with possible user data */
extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
@@ -198,6 +238,13 @@ extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*
/* Create a new sub memory object */
extern cl_mem cl_mem_new_sub_buffer(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *);
+extern cl_mem cl_mem_new_pipe(cl_context, cl_mem_flags, cl_uint, cl_uint, cl_int *);
+/* Query information about a pipe object */
+extern cl_int cl_get_pipe_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
+
+void* cl_mem_svm_allocate(cl_context, cl_svm_mem_flags, size_t, unsigned int);
+void cl_mem_svm_delete(cl_context, void *svm_pointer);
+
/* Idem but this is an image */
extern cl_mem
cl_mem_new_image(cl_context context,
@@ -217,30 +264,30 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
extern void cl_mem_add_ref(cl_mem);
/* api clEnqueueCopyBuffer help function */
-extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+extern cl_int cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
size_t src_offset, size_t dst_offset, size_t cb);
-extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+extern cl_int cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
cl_mem buffer, size_t offset, size_t size);
-extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*,
+extern cl_int cl_image_fill(cl_command_queue queue, cl_event e, const void * pattern, struct _cl_mem_image*,
const size_t *, const size_t *);
/* api clEnqueueCopyBufferRect help function */
-extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_event event, cl_mem, cl_mem,
const size_t *, const size_t *, const size_t *,
size_t, size_t, size_t, size_t);
/* api clEnqueueCopyImage help function */
-extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
- const size_t *, const size_t *, const size_t *);
+extern cl_int cl_mem_kernel_copy_image(cl_command_queue, cl_event event, struct _cl_mem_image*,
+ struct _cl_mem_image*, const size_t *, const size_t *, const size_t *);
/* api clEnqueueCopyImageToBuffer help function */
-extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
+extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, cl_event, struct _cl_mem_image*, cl_mem,
const size_t *, const size_t, const size_t *);
/* api clEnqueueCopyBufferToImage help function */
-extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
+extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_event, cl_mem, struct _cl_mem_image*,
const size_t, const size_t *, const size_t *);
/* Directly map a memory object */
@@ -314,5 +361,10 @@ extern cl_mem cl_mem_new_image_from_fd(cl_context ctx,
size_t row_pitch,
cl_int *errcode);
+extern cl_int cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
+ size_t size, const size_t *origin, const size_t *region);
+
+extern cl_int cl_mem_set_destructor_callback(cl_mem memobj,
+ void(CL_CALLBACK *pfn_notify)(cl_mem, void *), void *user_data);
#endif /* __CL_MEM_H__ */
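The new CL_MEM_SVM_TYPE plumbing and the cl_mem_svm_allocate/cl_mem_svm_delete declarations above back the OpenCL 2.0 coarse-grained SVM entry points. A host-side sketch of the round trip, assuming ctx comes from the usual clCreateContext setup (size and flags are illustrative):

#include <CL/cl.h>

void svm_roundtrip(cl_context ctx)
{
  /* clSVMAlloc is expected to land in cl_mem_svm_allocate; the returned
     pointer doubles as the lookup key on the free path. */
  void *p = clSVMAlloc(ctx, CL_MEM_READ_WRITE, 1024, 0);
  if (p == NULL)
    return;
  /* (Host access to coarse-grained SVM normally goes through
     clEnqueueSVMMap/clEnqueueSVMUnmap; omitted here.) */
  /* clSVMFree is expected to land in cl_mem_svm_delete, which finds the
     backing cl_mem via cl_context_get_svm_from_ptr and releases it. */
  clSVMFree(ctx, p);
}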
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index b0b2c1b..fdad067 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -74,10 +74,6 @@ cl_mem_new_gl_texture(cl_context ctx,
goto error;
}
- cl_mem_gl_image(mem)->target = texture_target;
- cl_mem_gl_image(mem)->miplevel = miplevel;
- cl_mem_gl_image(mem)->texture = texture;
-
exit:
if (errcode_ret)
*errcode_ret = err;
@@ -92,6 +88,5 @@ error:
LOCAL void cl_mem_gl_delete(struct _cl_mem_gl_image *gl_image)
{
if (gl_image->base.base.bo != NULL)
- cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image->target,
- gl_image->miplevel, gl_image->texture);
+ cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image);
}
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index d7a1f68..1f21f5d 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -31,7 +31,6 @@
.JOIN(FIELD,_sz) = sizeof(STRING),
static struct _cl_platform_id intel_platform_data = {
- INIT_ICD(dispatch)
DECL_INFO_STRING(profile, "FULL_PROFILE")
DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
DECL_INFO_STRING(name, "Intel Gen OCL Driver")
@@ -51,6 +50,7 @@ cl_get_platform_default(void)
return intel_platform;
intel_platform = &intel_platform_data;
+ CL_OBJECT_INIT_BASE(intel_platform, CL_OBJECT_PLATFORM_MAGIC);
cl_intel_platform_extension_init(intel_platform);
return intel_platform;
}
@@ -69,54 +69,3 @@ cl_get_platform_ids(cl_uint num_entries,
return CL_SUCCESS;
}
-
-#define DECL_FIELD(CASE,FIELD) \
- case JOIN(CL_,CASE): \
- if (param_value_size < cl_get_platform_default()->JOIN(FIELD,_sz)) \
- return CL_INVALID_VALUE; \
- if (param_value_size_ret != NULL) \
- *param_value_size_ret = cl_get_platform_default()->JOIN(FIELD,_sz); \
- memcpy(param_value, \
- cl_get_platform_default()->FIELD, \
- cl_get_platform_default()->JOIN(FIELD,_sz)); \
- return CL_SUCCESS;
-
-#define GET_FIELD_SZ(CASE,FIELD) \
- case JOIN(CL_,CASE): \
- if (param_value_size_ret != NULL) \
- *param_value_size_ret = cl_get_platform_default()->JOIN(FIELD,_sz); \
- return CL_SUCCESS;
-
-LOCAL cl_int
-cl_get_platform_info(cl_platform_id platform,
- cl_platform_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret)
-{
- if (param_value == NULL) {
- switch (param_name) {
- GET_FIELD_SZ (PLATFORM_PROFILE, profile);
- GET_FIELD_SZ (PLATFORM_VERSION, version);
- GET_FIELD_SZ (PLATFORM_NAME, name);
- GET_FIELD_SZ (PLATFORM_VENDOR, vendor);
- GET_FIELD_SZ (PLATFORM_EXTENSIONS, extensions);
- GET_FIELD_SZ (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
- default: return CL_INVALID_VALUE;
- }
- }
-
- /* Fetch the platform inform */
- switch (param_name) {
- DECL_FIELD (PLATFORM_PROFILE, profile);
- DECL_FIELD (PLATFORM_VERSION, version);
- DECL_FIELD (PLATFORM_NAME, name);
- DECL_FIELD (PLATFORM_VENDOR, vendor);
- DECL_FIELD (PLATFORM_EXTENSIONS, extensions);
- DECL_FIELD (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
- default: return CL_INVALID_VALUE;
- }
-}
-
-#undef DECL_FIELD
-
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index 865317a..3fdb920 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -23,12 +23,12 @@
#include "CL/cl.h"
#include "cl_internals.h"
#include "cl_extensions.h"
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
#include "src/OCLConfig.h"
#include "src/git_sha1.h"
struct _cl_platform_id {
- DEFINE_ICD(dispatch)
+ _cl_base_object base;
const char *profile;
const char *version;
const char *name;
@@ -44,6 +44,11 @@ struct _cl_platform_id {
struct cl_extensions *internal_extensions;
};
+#define CL_OBJECT_PLATFORM_MAGIC 0xaacdbb00123ccd85LL
+#define CL_OBJECT_IS_PLATFORM(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_PLATFORM_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+
/* Return the default platform */
extern cl_platform_id cl_get_platform_default(void);
@@ -52,13 +57,6 @@ extern cl_int cl_get_platform_ids(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
-/* Return information for the current platform */
-extern cl_int cl_get_platform_info(cl_platform_id platform,
- cl_platform_info param_name,
- size_t param_value_size,
- void * param_value,
- size_t * param_value_size_ret);
-
#define _STR(x) #x
#define _JOINT(x, y) _STR(x) "." _STR(y)
#define _JOINT3(x, y, z) _STR(x) "." _STR(y) "." _STR(z)
diff --git a/src/cl_program.c b/src/cl_program.c
index 17f64ca..0358705 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -66,7 +66,7 @@ cl_program_delete(cl_program p)
return;
/* We are not done with it yet */
- if ((ref = atomic_dec(&p->ref_n)) > 1) return;
+ if ((ref = CL_OBJECT_DEC_REF(p)) > 1) return;
/* Destroy the sources and binary if still allocated */
cl_program_release_sources(p);
@@ -83,17 +83,6 @@ cl_program_delete(cl_program p)
p->build_log = NULL;
}
- /* Remove it from the list */
- assert(p->ctx);
- pthread_mutex_lock(&p->ctx->program_lock);
- if (p->prev)
- p->prev->next = p->next;
- if (p->next)
- p->next->prev = p->prev;
- if (p->ctx->programs == p)
- p->ctx->programs = p->next;
- pthread_mutex_unlock(&p->ctx->program_lock);
-
#ifdef HAS_CMRT
if (p->cmrt_program != NULL)
cmrt_destroy_program(p);
@@ -106,8 +95,12 @@ cl_program_delete(cl_program p)
cl_free(p->ker);
}
- /* Program belongs to their parent context */
- cl_context_delete(p->ctx);
+ if (p->global_data_ptr)
+ cl_buffer_unreference(p->global_data);
+ cl_free(p->global_data_ptr);
+
+ /* Remove it from the list */
+ cl_context_remove_program(p->ctx, p);
/* Free the program as allocated by the compiler */
if (p->opaque) {
@@ -120,7 +113,7 @@ cl_program_delete(cl_program p)
interp_program_delete(p->opaque);
}
- p->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ CL_OBJECT_DESTROY_BASE(p);
cl_free(p);
}
@@ -132,17 +125,15 @@ cl_program_new(cl_context ctx)
/* Allocate the structure */
TRY_ALLOC_NO_ERR (p, CALLOC(struct _cl_program));
- SET_ICD(p->dispatch)
+ CL_OBJECT_INIT_BASE(p, CL_OBJECT_PROGRAM_MAGIC);
p->build_status = CL_BUILD_NONE;
- p->ref_n = 1;
- p->magic = CL_MAGIC_PROGRAM_HEADER;
- p->ctx = ctx;
p->cmrt_program = NULL;
p->build_log = calloc(BUILD_LOG_MAX_SIZE, sizeof(char));
if (p->build_log)
p->build_log_max_sz = BUILD_LOG_MAX_SIZE;
+
/* The program also belongs to its context */
- cl_context_add_ref(ctx);
+ cl_context_add_program(ctx, p);
exit:
return p;
@@ -155,7 +146,7 @@ LOCAL void
cl_program_add_ref(cl_program p)
{
assert(p);
- atomic_inc(&p->ref_n);
+ CL_OBJECT_INC_REF(p);
}
static cl_int
@@ -217,6 +208,51 @@ LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX ind
#define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
#define isCMRT(BufPtr) headerCompare(BufPtr, BHI_CMRT)
+static cl_int get_program_global_data(cl_program prog) {
+//OpenCL 1.2 never calls this function, and OpenCL 2.0 always has HAS_BO_SET_SOFTPIN.
+#ifdef HAS_BO_SET_SOFTPIN
+ cl_buffer_mgr bufmgr = NULL;
+ bufmgr = cl_context_get_bufmgr(prog->ctx);
+ assert(bufmgr);
+ size_t const_size = interp_program_get_global_constant_size(prog->opaque);
+ if (const_size == 0) return CL_SUCCESS;
+
+ int page_size = getpagesize();
+ size_t alignedSz = ALIGN(const_size, page_size);
+ char * p = (char*)cl_aligned_malloc(alignedSz, page_size);
+ prog->global_data_ptr = p;
+ interp_program_get_global_constant_data(prog->opaque, (char*)p);
+
+ prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global data", p, alignedSz, 0);
+ cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
+ cl_buffer_set_bo_use_full_range(prog->global_data, 1);
+
+ uint32_t reloc_count = interp_program_get_global_reloc_count(prog->opaque);
+ if (reloc_count > 0) {
+ uint32_t x;
+ struct RelocEntry {int refOffset; int defOffset;};
+ char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
+ interp_program_get_global_reloc_table(prog->opaque, temp);
+ for (x = 0; x < reloc_count; x++) {
+ int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
+ *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset + (uint64_t)p;
+ }
+ free(temp);
+ }
+#if 0
+ int x = 0;
+ for (x = 0; x < const_size; x++) {
+ printf("offset %d data: %x\n", x, (unsigned)p[x]);
+ }
+#endif
+#endif
+ return CL_SUCCESS;
+}
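The relocation loop above rewrites each (refOffset, defOffset) pair into an absolute pointer: the 64-bit slot at base + refOffset receives base + defOffset. A self-contained sketch of the same patching, with offsets invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct RelocEntry { int refOffset; int defOffset; };

int main(void)
{
  char data[64];
  struct RelocEntry r = { 8, 32 }; /* pointer slot at +8, target at +32 */
  memset(data, 0, sizeof(data));
  /* Patch the slot so it points at offset 32 of this same buffer. */
  *(uint64_t *)&data[r.refOffset] = (uint64_t)(uintptr_t)&data[0] + r.defOffset;
  printf("slot at +8 now holds base %p + 32\n", (void *)&data[0]);
  return 0;
}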
+
+LOCAL size_t cl_program_get_global_variable_size(cl_program prog) {
+ return interp_program_get_global_constant_size(prog->opaque);
+}
+
LOCAL cl_program
cl_program_create_from_binary(cl_context ctx,
cl_uint num_devices,
@@ -232,7 +268,7 @@ cl_program_create_from_binary(cl_context ctx,
assert(ctx);
INVALID_DEVICE_IF (num_devices != 1);
INVALID_DEVICE_IF (devices == NULL);
- INVALID_DEVICE_IF (devices[0] != ctx->device);
+ INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
INVALID_VALUE_IF (binaries == NULL);
INVALID_VALUE_IF (lengths == NULL);
@@ -269,7 +305,7 @@ cl_program_create_from_binary(cl_context ctx,
TRY_ALLOC(typed_binary, cl_calloc(lengths[0]+1, sizeof(char)));
memcpy(typed_binary+1, binaries[0], lengths[0]);
*typed_binary = 1;
- program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->device_id, typed_binary, program->binary_sz+1);
+ program->opaque = compiler_program_new_from_llvm_binary(program->ctx->devices[0]->device_id, typed_binary, program->binary_sz+1);
cl_free(typed_binary);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
@@ -287,7 +323,7 @@ cl_program_create_from_binary(cl_context ctx,
err= CL_INVALID_BINARY;
goto error;
}
- program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->device_id, program->binary, program->binary_sz);
+ program->opaque = compiler_program_new_from_llvm_binary(program->ctx->devices[0]->device_id, program->binary, program->binary_sz);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
@@ -296,7 +332,7 @@ cl_program_create_from_binary(cl_context ctx,
program->source_type = FROM_LLVM;
}
else if (isGenBinary((unsigned char*)program->binary)) {
- program->opaque = interp_program_new_from_binary(program->ctx->device->device_id, program->binary, program->binary_sz);
+ program->opaque = interp_program_new_from_binary(program->ctx->devices[0]->device_id, program->binary, program->binary_sz);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
goto error;
@@ -338,7 +374,7 @@ cl_program_create_with_built_in_kernles(cl_context ctx,
assert(ctx);
INVALID_DEVICE_IF (num_devices != 1);
INVALID_DEVICE_IF (devices == NULL);
- INVALID_DEVICE_IF (devices[0] != ctx->device);
+ INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
cl_int binary_status = CL_SUCCESS;
extern char cl_internal_built_in_kernel_str[];
@@ -346,7 +382,7 @@ cl_program_create_with_built_in_kernles(cl_context ctx,
char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
- &ctx->device,
+ &ctx->devices[0],
(size_t*)&cl_internal_built_in_kernel_str_size,
(const unsigned char **)&p_built_in_kernel_str,
&binary_status, &err);
@@ -372,12 +408,12 @@ cl_program_create_with_built_in_kernles(cl_context ctx,
kernel = strtok_r( local_kernel_names, delims , &saveptr);
while( kernel != NULL ) {
- matched_kernel = strstr(ctx->device->built_in_kernels, kernel);
+ matched_kernel = strstr(ctx->devices[0]->built_in_kernels, kernel);
if(matched_kernel){
for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
assert(ctx->built_in_prgs->ker[i]);
const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
- if (strcmp(ker_name, kernel) == 0) {
+ if (ker_name != NULL && strcmp(ker_name, kernel) == 0) {
break;
}
}
@@ -412,7 +448,7 @@ cl_program_create_from_llvm(cl_context ctx,
assert(ctx);
INVALID_DEVICE_IF (num_devices != 1);
INVALID_DEVICE_IF (devices == NULL);
- INVALID_DEVICE_IF (devices[0] != ctx->device);
+ INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
INVALID_VALUE_IF (file_name == NULL);
program = cl_program_new(ctx);
@@ -421,7 +457,7 @@ cl_program_create_from_llvm(cl_context ctx,
goto error;
}
- program->opaque = compiler_program_new_from_llvm(ctx->device->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
+ program->opaque = compiler_program_new_from_llvm(ctx->devices[0]->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
goto error;
@@ -503,7 +539,7 @@ static int check_cl_version_option(cl_program p, const char* options) {
const char* s = NULL;
int ver1 = 0;
int ver2 = 0;
- char version_str[64];
+ char version_str[64] = {0};
if (options && (s = strstr(options, "-cl-std="))) {
@@ -518,7 +554,7 @@ static int check_cl_version_option(cl_program p, const char* options) {
ver1 = (s[10] - '0') * 10 + (s[12] - '0');
- if (cl_get_device_info(p->ctx->device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
+ if (cl_get_device_info(p->ctx->devices[0], CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
version_str, NULL) != CL_SUCCESS)
return 0;
@@ -541,7 +577,7 @@ cl_program_build(cl_program p, const char *options)
int i = 0;
int copyed = 0;
- if (p->ref_n > 1) {
+ if (CL_OBJECT_GET_REF(p) > 1) {
err = CL_INVALID_OPERATION;
goto error;
}
@@ -586,7 +622,7 @@ cl_program_build(cl_program p, const char *options)
goto error;
}
- p->opaque = compiler_program_new_from_source(p->ctx->device->device_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+ p->opaque = compiler_program_new_from_source(p->ctx->devices[0]->device_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
if (UNLIKELY(p->opaque == NULL)) {
if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
err = CL_INVALID_BUILD_OPTIONS;
@@ -614,7 +650,7 @@ cl_program_build(cl_program p, const char *options)
/* Create all the kernels */
TRY (cl_program_load_gen_program, p);
} else if (p->source_type == FROM_BINARY && p->binary_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
- p->opaque = interp_program_new_from_binary(p->ctx->device->device_id, p->binary, p->binary_sz);
+ p->opaque = interp_program_new_from_binary(p->ctx->devices[0]->device_id, p->binary, p->binary_sz);
if (UNLIKELY(p->opaque == NULL)) {
err = CL_BUILD_PROGRAM_FAILURE;
goto error;
@@ -638,6 +674,9 @@ cl_program_build(cl_program p, const char *options)
memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
copyed += sz;
}
+ if ((err = get_program_global_data(p)) != CL_SUCCESS)
+ goto error;
+
p->is_built = 1;
p->build_status = CL_BUILD_SUCCESS;
return CL_SUCCESS;
@@ -706,7 +745,7 @@ cl_program_link(cl_context context,
goto error;
}
- p->opaque = compiler_program_new_gen_program(context->device->device_id, NULL, NULL, NULL);
+ p->opaque = compiler_program_new_gen_program(context->devices[0]->device_id, NULL, NULL, NULL);
for(i = 0; i < num_input_programs; i++) {
// if the program was created from an LLVM binary, deserialize it first to get the module.
if(input_programs[i])
@@ -743,6 +782,10 @@ cl_program_link(cl_context context,
memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
copyed += sz;
}
+
+ if ((err = get_program_global_data(p)) != CL_SUCCESS)
+ goto error;
+
done:
if(p) p->is_built = 1;
if(p) p->build_status = CL_BUILD_SUCCESS;
@@ -768,7 +811,7 @@ cl_program_compile(cl_program p,
cl_int err = CL_SUCCESS;
int i = 0;
- if (p->ref_n > 1) {
+ if (CL_OBJECT_GET_REF(p) > 1) {
err = CL_INVALID_OPERATION;
goto error;
}
@@ -841,7 +884,7 @@ cl_program_compile(cl_program p,
}
}
- p->opaque = compiler_program_compile_from_source(p->ctx->device->device_id, p->source, temp_header_path,
+ p->opaque = compiler_program_compile_from_source(p->ctx->devices[0]->device_id, p->source, temp_header_path,
p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
char rm_path[255]="rm ";
@@ -902,7 +945,7 @@ cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
for (i = 0; i < p->ker_n; ++i) {
assert(p->ker[i]);
const char *ker_name = cl_kernel_get_name(p->ker[i]);
- if (strcmp(ker_name, name) == 0) {
+ if (ker_name != NULL && strcmp(ker_name, name) == 0) {
from = p->ker[i];
break;
}
@@ -961,10 +1004,13 @@ cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size
return;
}
- ker_name = cl_kernel_get_name(p->ker[i]);
- len = strlen(ker_name);
- if(names) {
- strncpy(names, cl_kernel_get_name(p->ker[0]), size - 1);
+ ker_name = cl_kernel_get_name(p->ker[0]);
+ if (ker_name != NULL)
+ len = strlen(ker_name);
+ else
+ len = 0;
+ if(names && ker_name) {
+ strncpy(names, ker_name, size - 1);
names[size - 1] = '\0';
if(size < len - 1) {
if(size_ret) *size_ret = size;
@@ -972,12 +1018,15 @@ cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size
}
size = size - len - 1; //sub \0
}
- if(size_ret) *size_ret = strlen(ker_name) + 1; //add NULL
+ if(size_ret) *size_ret = len + 1; //add NULL
for (i = 1; i < p->ker_n; ++i) {
ker_name = cl_kernel_get_name(p->ker[i]);
- len = strlen(ker_name);
- if(names) {
+ if (ker_name != NULL)
+ len = strlen(ker_name);
+ else
+ len = 0;
+ if(names && ker_name) {
strncat(names, ";", size);
if(size >= 1)
strncat(names, ker_name, size - 1);
diff --git a/src/cl_program.h b/src/cl_program.h
index b69e00c..6e8e84a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -22,6 +22,7 @@
#include "cl_internals.h"
#include "cl_gbe_loader.h"
+#include "cl_base_object.h"
#include "CL/cl.h"
#include <stdint.h>
@@ -49,13 +50,13 @@ typedef enum _BINARY_HEADER_INDEX {
/* This maps an OCL file containing some kernels */
struct _cl_program {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a program */
- volatile int ref_n; /* We reference count this object */
+ _cl_base_object base;
gbe_program opaque; /* (Opaque) program as output by the compiler */
cl_kernel *ker; /* All kernels included by the OCL file */
cl_program prev, next; /* We chain the programs together */
cl_context ctx; /* Its parent context */
+ cl_buffer global_data;
+ char * global_data_ptr;
char *bin; /* The program copied verbatim */
size_t bin_sz; /* Its size in memory */
char *source; /* Program sources */
@@ -75,6 +76,11 @@ struct _cl_program {
void* cmrt_program; /* real type: CmProgram* */
};
+#define CL_OBJECT_PROGRAM_MAGIC 0x34562ab12789cdefLL
+#define CL_OBJECT_IS_PROGRAM(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_PROGRAM_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+
/* Create an empty program */
extern cl_program cl_program_new(cl_context);
@@ -146,5 +152,7 @@ cl_program_get_kernel_names(cl_program p,
size_t size,
char *names,
size_t *size_ret);
+extern size_t
+cl_program_get_global_variable_size(cl_program p);
#endif /* __CL_PROGRAM_H__ */
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index 45c1fdf..d1e6dfe 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -71,44 +71,31 @@ int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
}
LOCAL cl_sampler
-cl_sampler_new(cl_context ctx,
- cl_bool normalized_coords,
- cl_addressing_mode address,
- cl_filter_mode filter,
- cl_int *errcode_ret)
+cl_create_sampler(cl_context ctx, cl_bool normalized_coords, cl_addressing_mode address,
+ cl_filter_mode filter, cl_int *errcode_ret)
{
cl_sampler sampler = NULL;
- cl_int err = CL_SUCCESS;
/* Allocate and initialize the structure itself */
- TRY_ALLOC (sampler, CALLOC(struct _cl_sampler));
- SET_ICD(sampler->dispatch)
- sampler->ref_n = 1;
- sampler->magic = CL_MAGIC_SAMPLER_HEADER;
+ sampler = cl_calloc(1, sizeof(_cl_sampler));
+ if (sampler == NULL) {
+ *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+ return NULL;
+ }
+
+ CL_OBJECT_INIT_BASE(sampler, CL_OBJECT_SAMPLER_MAGIC);
sampler->normalized_coords = normalized_coords;
sampler->address = address;
sampler->filter = filter;
/* Append the sampler in the context sampler list */
- pthread_mutex_lock(&ctx->sampler_lock);
- sampler->next = ctx->samplers;
- if (ctx->samplers != NULL)
- ctx->samplers->prev = sampler;
- ctx->samplers = sampler;
- pthread_mutex_unlock(&ctx->sampler_lock);
- sampler->ctx = ctx;
- cl_context_add_ref(ctx);
+ cl_context_add_sampler(ctx, sampler);
+  // TODO: Maybe move this elsewhere; it is not common sampler logic.
sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
-exit:
- if (errcode_ret)
- *errcode_ret = err;
+ *errcode_ret = CL_SUCCESS;
return sampler;
-error:
- cl_sampler_delete(sampler);
- sampler = NULL;
- goto exit;
}
LOCAL void
@@ -116,20 +103,11 @@ cl_sampler_delete(cl_sampler sampler)
{
if (UNLIKELY(sampler == NULL))
return;
- if (atomic_dec(&sampler->ref_n) > 1)
+ if (CL_OBJECT_DEC_REF(sampler) > 1)
return;
- assert(sampler->ctx);
- pthread_mutex_lock(&sampler->ctx->sampler_lock);
- if (sampler->prev)
- sampler->prev->next = sampler->next;
- if (sampler->next)
- sampler->next->prev = sampler->prev;
- if (sampler->ctx->samplers == sampler)
- sampler->ctx->samplers = sampler->next;
- pthread_mutex_unlock(&sampler->ctx->sampler_lock);
- cl_context_delete(sampler->ctx);
-
+ cl_context_remove_sampler(sampler->ctx, sampler);
+ CL_OBJECT_DESTROY_BASE(sampler);
cl_free(sampler);
}
@@ -137,6 +115,6 @@ LOCAL void
cl_sampler_add_ref(cl_sampler sampler)
{
assert(sampler);
- atomic_inc(&sampler->ref_n);
+ CL_OBJECT_INC_REF(sampler);
}
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index fc4b7e7..ce06eb4 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -21,37 +21,32 @@
#define __CL_SAMPLER_H__
#include "CL/cl.h"
+#include "cl_base_object.h"
#include "../backend/src/ocl_common_defines.h"
#include <stdint.h>
/* How to access images */
-struct _cl_sampler {
- DEFINE_ICD(dispatch)
- uint64_t magic; /* To identify it as a sampler object */
- volatile int ref_n; /* This object is reference counted */
- cl_sampler prev, next; /* We chain the samplers in the allocator */
- cl_context ctx; /* Context it belongs to */
- cl_bool normalized_coords; /* Are coordinates normalized? */
- cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
- cl_filter_mode filter; /* LINEAR / NEAREST mostly */
+typedef struct _cl_sampler {
+ _cl_base_object base;
+ cl_context ctx; /* Context it belongs to */
+ cl_bool normalized_coords; /* Are coordinates normalized? */
+ cl_addressing_mode address; /* CLAMP / REPEAT and so on... */
+ cl_filter_mode filter; /* LINEAR / NEAREST mostly */
uint32_t clkSamplerValue;
-};
+} _cl_sampler;
-/* Create a new sampler object */
-extern cl_sampler cl_sampler_new(cl_context,
- cl_bool,
- cl_addressing_mode,
- cl_filter_mode,
- cl_int *err);
+#define CL_OBJECT_SAMPLER_MAGIC 0x686a0ecba79ce32fLL
+#define CL_OBJECT_IS_SAMPLER(obj) ((obj && \
+ ((cl_base_object)obj)->magic == CL_OBJECT_SAMPLER_MAGIC && \
+ CL_OBJECT_GET_REF(obj) >= 1))
+/* Create a new sampler object */
+extern cl_sampler cl_create_sampler(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *err);
/* Unref the object and delete it if no more reference on it */
extern void cl_sampler_delete(cl_sampler);
-
/* Add one more reference to this object */
extern void cl_sampler_add_ref(cl_sampler);
-
/* set a sampler kernel argument */
int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
#endif /* __CL_SAMPLER_H__ */
-
diff --git a/src/cl_thread.c b/src/cl_thread.c
deleted file mode 100644
index 0780513..0000000
--- a/src/cl_thread.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- */
-#include <string.h>
-#include <stdio.h>
-
-#include "cl_thread.h"
-#include "cl_alloc.h"
-#include "cl_utils.h"
-
-/* Because the cl_command_queue can be used in several threads simultaneously but
- without add ref to it, we now handle it like this:
- Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it
- does not have a slot, assign it.
- The resources are keeped in queue private, and resize it if needed.
- When the thread exit, the slot will be set invalid.
- When queue released, all the resources will be released. If user still enqueue, flush
- or finish the queue after it has been released, the behavior is undefined.
- TODO: Need to shrink the slot map.
- */
-
-static int thread_array_num = 1;
-static int *thread_slot_map = NULL;
-static int thread_magic_num = 1;
-static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
-
-typedef struct _thread_spec_data {
- cl_gpgpu gpgpu ;
- int valid;
- void* thread_batch_buf;
- cl_event last_event;
- cl_event current_event;
- int thread_magic;
-} thread_spec_data;
-
-typedef struct _queue_thread_private {
- thread_spec_data** threads_data;
- int threads_data_num;
- pthread_mutex_t thread_data_lock;
-} queue_thread_private;
-
-static pthread_once_t key_once = PTHREAD_ONCE_INIT;
-static pthread_key_t thread_id_key;
-static pthread_key_t thread_magic_key;
-
-static void create_thread_key()
-{
- pthread_key_create(&thread_id_key, NULL);
- pthread_key_create(&thread_magic_key, NULL);
-}
-
-static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
-{
- queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
- thread_spec_data* spec = NULL;
- int i = 0;
- int *id = NULL, *magic = NULL;
-
- pthread_once(&key_once, create_thread_key);
- id = pthread_getspecific(thread_id_key);
- if(id == NULL) {
- id = (int *)malloc(sizeof(int));
- *id = -1;
- pthread_setspecific(thread_id_key, id);
- }
- magic = pthread_getspecific(thread_magic_key);
- if(magic == NULL) {
- magic = (int *)malloc(sizeof(int));
- *magic = -1;
- pthread_setspecific(thread_magic_key, magic);
- }
-
- if (*id == -1) {
- pthread_mutex_lock(&thread_queue_map_lock);
- for (i = 0; i < thread_array_num; i++) {
- if (thread_slot_map[i] == 0) {
- *id = i;
- break;
- }
- }
-
- if (i == thread_array_num) {
- thread_array_num *= 2;
- thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num);
-
- if(thread_slot_map == NULL) {
- pthread_mutex_unlock(&thread_queue_map_lock);
- return NULL;
- }
-
- memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2));
- *id = thread_array_num/2;
- }
-
- thread_slot_map[*id] = 1;
-
- *magic = thread_magic_num++;
- pthread_mutex_unlock(&thread_queue_map_lock);
- }
-
- pthread_mutex_lock(&thread_private->thread_data_lock);
- if (thread_array_num > thread_private->threads_data_num) {// just enlarge
- int old_num = thread_private->threads_data_num;
- thread_private->threads_data_num = thread_array_num;
- thread_private->threads_data = realloc(thread_private->threads_data,
- thread_private->threads_data_num * sizeof(void *));
-
- if(thread_private->threads_data == NULL) {
- pthread_mutex_unlock(&thread_private->thread_data_lock);
- return NULL;
- }
-
- memset(thread_private->threads_data + old_num, 0,
- sizeof(void*) * (thread_private->threads_data_num - old_num));
- }
-
- assert(*id != -1 && *id < thread_array_num);
- spec = thread_private->threads_data[*id];
- if (!spec && create) {
- spec = CALLOC(thread_spec_data);
- spec->thread_magic = *magic;
- thread_private->threads_data[*id] = spec;
- }
-
- pthread_mutex_unlock(&thread_private->thread_data_lock);
-
- return spec;
-}
-
-cl_event get_current_event(cl_command_queue queue)
-{
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- int *magic = pthread_getspecific(thread_magic_key);
- assert(spec && magic && spec->thread_magic == *magic);
- return spec->current_event;
-}
-
-cl_event get_last_event(cl_command_queue queue)
-{
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- int *magic = pthread_getspecific(thread_magic_key);
- assert(spec && magic && spec->thread_magic == *magic);
- return spec->last_event;
-}
-
-void set_current_event(cl_command_queue queue, cl_event e)
-{
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- int *magic = pthread_getspecific(thread_magic_key);
- assert(spec && magic && spec->thread_magic == *magic);
- spec->current_event = e;
-}
-
-void set_last_event(cl_command_queue queue, cl_event e)
-{
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- int *magic = pthread_getspecific(thread_magic_key);
- assert(spec && magic && spec->thread_magic == *magic);
- spec->last_event = e;
-}
-
-void* cl_thread_data_create(void)
-{
- queue_thread_private* thread_private = CALLOC(queue_thread_private);
-
- if (thread_private == NULL)
- return NULL;
-
- if (thread_slot_map == NULL) {
- pthread_mutex_lock(&thread_queue_map_lock);
- thread_slot_map = calloc(thread_array_num, sizeof(int));
- pthread_mutex_unlock(&thread_queue_map_lock);
-
- }
-
- pthread_mutex_init(&thread_private->thread_data_lock, NULL);
-
- pthread_mutex_lock(&thread_private->thread_data_lock);
- thread_private->threads_data = malloc(thread_array_num * sizeof(void *));
- memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num);
- thread_private->threads_data_num = thread_array_num;
- pthread_mutex_unlock(&thread_private->thread_data_lock);
-
- return thread_private;
-}
-
-cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
-{
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- if(!spec)
- return NULL;
- int *magic = pthread_getspecific(thread_magic_key);
- assert(magic);
-
- if (!spec->thread_magic && spec->thread_magic != *magic) {
- //We may get the slot from last thread. So free the resource.
- spec->valid = 0;
- }
-
- if (!spec->valid) {
- if (spec->thread_batch_buf) {
- cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
- spec->thread_batch_buf = NULL;
- }
- if (spec->gpgpu) {
- cl_gpgpu_delete(spec->gpgpu);
- spec->gpgpu = NULL;
- }
- TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv));
- spec->valid = 1;
- }
-
- error:
- return spec->gpgpu;
-}
-
-void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
-{
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- int *magic = pthread_getspecific(thread_magic_key);
-
- assert(spec && magic && spec->thread_magic == *magic);
-
- if (spec->thread_batch_buf) {
- cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
- }
- spec->thread_batch_buf = buf;
-}
-
-void* cl_get_thread_batch_buf(cl_command_queue queue) {
- thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- int *magic = pthread_getspecific(thread_magic_key);
-
- assert(spec && magic && spec->thread_magic == *magic);
-
- return spec->thread_batch_buf;
-}
-
-void cl_invalid_thread_gpgpu(cl_command_queue queue)
-{
- int *id = pthread_getspecific(thread_id_key);
- queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
- thread_spec_data* spec = NULL;
-
- pthread_mutex_lock(&thread_private->thread_data_lock);
- assert(id);
- spec = thread_private->threads_data[*id];
- assert(spec);
- pthread_mutex_unlock(&thread_private->thread_data_lock);
-
- if (!spec->valid) {
- return;
- }
-
- assert(spec->gpgpu);
- cl_gpgpu_delete(spec->gpgpu);
- spec->gpgpu = NULL;
- spec->valid = 0;
-}
-
-cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
-{
- int *id = pthread_getspecific(thread_id_key);
- queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
- thread_spec_data* spec = NULL;
-
- pthread_mutex_lock(&thread_private->thread_data_lock);
- assert(id);
- spec = thread_private->threads_data[*id];
- assert(spec);
- pthread_mutex_unlock(&thread_private->thread_data_lock);
-
- if (!spec->valid)
- return NULL;
-
- assert(spec->gpgpu);
- cl_gpgpu gpgpu = spec->gpgpu;
- spec->gpgpu = NULL;
- spec->valid = 0;
- return gpgpu;
-}
-
-/* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(cl_command_queue queue)
-{
- int i = 0;
- queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
- int threads_data_num;
- thread_spec_data** threads_data;
-
- pthread_mutex_lock(&thread_private->thread_data_lock);
- threads_data_num = thread_private->threads_data_num;
- threads_data = thread_private->threads_data;
- thread_private->threads_data_num = 0;
- thread_private->threads_data = NULL;
- pthread_mutex_unlock(&thread_private->thread_data_lock);
- cl_free(thread_private);
- queue->thread_data = NULL;
-
- for (i = 0; i < threads_data_num; i++) {
- if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) {
- cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf);
- threads_data[i]->thread_batch_buf = NULL;
- }
-
- if (threads_data[i] != NULL && threads_data[i]->valid) {
- cl_gpgpu_delete(threads_data[i]->gpgpu);
- threads_data[i]->gpgpu = NULL;
- threads_data[i]->valid = 0;
- }
- cl_free(threads_data[i]);
- }
-
- cl_free(threads_data);
-}
diff --git a/src/cl_thread.h b/src/cl_thread.h
deleted file mode 100644
index d77526b..0000000
--- a/src/cl_thread.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-#ifndef __CL_THREAD_H__
-#define __CL_THREAD_H__
-
-#include <pthread.h>
-#include "cl_internals.h"
-#include "cl_command_queue.h"
-
-/* Create the thread specific data. */
-void* cl_thread_data_create(void);
-
-/* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(cl_command_queue queue);
-
-/* Used to get the gpgpu struct of each thread. */
-cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
-
-/* Used to release the gpgpu struct of each thread. */
-void cl_invalid_thread_gpgpu(cl_command_queue queue);
-
-/* Used to set the batch buffer of each thread. */
-void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
-
-/* Used to get the batch buffer of each thread. */
-void* cl_get_thread_batch_buf(cl_command_queue queue);
-
-/* take current gpgpu from the thread gpgpu pool. */
-cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
-
-cl_event get_current_event(cl_command_queue queue);
-cl_event get_last_event(cl_command_queue queue);
-void set_current_event(cl_command_queue queue, cl_event e);
-void set_last_event(cl_command_queue queue, cl_event e);
-
-#endif /* __CL_THREAD_H__ */
diff --git a/src/cl_utils.c b/src/cl_utils.c
new file mode 100644
index 0000000..38de1ea
--- /dev/null
+++ b/src/cl_utils.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_utils.h"
+#include <string.h>
+#include <assert.h>
+
+LOCAL void
+list_node_insert_before(struct list_node *node, struct list_node *the_new)
+{
+ list_node *before_node = node->p;
+ the_new->p = before_node;
+ the_new->n = node;
+ node->p = the_new;
+ before_node->n = the_new;
+}
+
+LOCAL void
+list_node_insert_after(struct list_node *node, struct list_node *the_new)
+{
+ list_node *after_node = node->n;
+ the_new->n = after_node;
+ the_new->p = node;
+ node->n = the_new;
+ after_node->p = the_new;
+}
+
+LOCAL void
+list_move(struct list_head *the_old, struct list_head *the_new)
+{
+ assert(list_empty(the_new));
+ if (list_empty(the_old)) {
+ return;
+ }
+
+ memcpy(&the_new->head_node, &the_old->head_node, sizeof(list_node));
+ the_new->head_node.n->p = &the_new->head_node;
+ the_new->head_node.p->n = &the_new->head_node;
+ list_init(the_old);
+}
+
+LOCAL void
+list_merge(struct list_head *head, struct list_head *to_merge)
+{
+ if (list_empty(to_merge))
+ return;
+
+ list_node *merge_last_node = to_merge->head_node.p;
+ list_node *merge_first_node = to_merge->head_node.n;
+
+ merge_last_node->n = &head->head_node;
+ merge_first_node->p = head->head_node.p;
+ head->head_node.p->n = merge_first_node;
+ head->head_node.p = merge_last_node;
+ list_init(to_merge);
+}
+
+LOCAL cl_int
+cl_get_info_helper(const void *src, size_t src_size, void *dst, size_t dst_size, size_t *ret_size)
+{
+ if (dst && dst_size < src_size)
+ return CL_INVALID_VALUE;
+
+ if (dst && dst_size) {
+ memcpy(dst, src, src_size);
+ }
+
+ if (ret_size)
+ *ret_size = src_size;
+ return CL_SUCCESS;
+}
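Note on the helper above: cl_get_info_helper implements the usual
clGetXxxInfo contract: a NULL dst performs a pure size query through
ret_size, and a non-NULL dst smaller than src_size yields
CL_INVALID_VALUE. A minimal caller-side sketch, assuming cl_utils.h is
on the include path; the query function and its value are illustrative,
not part of the patch:

    #include <stdio.h>
    #include "cl_utils.h" /* cl_get_info_helper */

    /* Hypothetical query built on the new helper. */
    static cl_int example_get_name(void *dst, size_t dst_size, size_t *ret_size)
    {
      const char name[] = "beignet";
      return cl_get_info_helper(name, sizeof(name), dst, dst_size, ret_size);
    }

    static void example(void)
    {
      size_t sz;
      example_get_name(NULL, 0, &sz);           /* size query: sz == 8 */
      char buf[8];
      example_get_name(buf, sizeof(buf), NULL); /* copies "beignet" */
      printf("%s needs %zu bytes\n", buf, sz);
    }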
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 2926611..2d24207 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -19,6 +19,7 @@
#ifndef __CL_UTILS_H__
#define __CL_UTILS_H__
+#include "CL/cl.h"
/* INLINE is forceinline */
#define INLINE __attribute__((always_inline)) inline
@@ -124,7 +125,7 @@ do { \
err = CL_INVALID_CONTEXT; \
goto error; \
} \
- if (UNLIKELY(CTX->magic != CL_MAGIC_CONTEXT_HEADER)) { \
+ if (UNLIKELY(!CL_OBJECT_IS_CONTEXT(CTX))) { \
err = CL_INVALID_CONTEXT; \
goto error; \
} \
@@ -136,7 +137,7 @@ do { \
err = CL_INVALID_COMMAND_QUEUE; \
goto error; \
} \
- if (UNLIKELY(QUEUE->magic != CL_MAGIC_QUEUE_HEADER)) { \
+ if (UNLIKELY(!CL_OBJECT_IS_COMMAND_QUEUE(QUEUE))) { \
err = CL_INVALID_COMMAND_QUEUE; \
goto error; \
} \
@@ -148,7 +149,7 @@ do { \
err = CL_INVALID_MEM_OBJECT; \
goto error; \
} \
- if (UNLIKELY(MEM->magic != CL_MAGIC_MEM_HEADER)) { \
+ if (UNLIKELY(!CL_OBJECT_IS_MEM(MEM))) { \
err = CL_INVALID_MEM_OBJECT; \
goto error; \
} \
@@ -215,7 +216,7 @@ do { \
err = CL_INVALID_EVENT; \
goto error; \
} \
- if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) { \
+ if (UNLIKELY(!CL_OBJECT_IS_EVENT(EVENT))) { \
err = CL_INVALID_EVENT; \
goto error; \
} \
@@ -227,7 +228,7 @@ do { \
err = CL_INVALID_SAMPLER; \
goto error; \
} \
- if (UNLIKELY(SAMPLER->magic != CL_MAGIC_SAMPLER_HEADER)) {\
+ if (UNLIKELY(!CL_OBJECT_IS_SAMPLER(SAMPLER))) { \
err = CL_INVALID_SAMPLER; \
goto error; \
} \
@@ -239,7 +240,7 @@ do { \
err = CL_INVALID_ACCELERATOR_INTEL; \
goto error; \
} \
- if (UNLIKELY(ACCELERATOR_INTEL->magic != CL_MAGIC_ACCELERATOR_INTEL_HEADER)) {\
+ if (UNLIKELY(!CL_OBJECT_IS_ACCELERATOR_INTEL(ACCELERATOR_INTEL))) { \
err = CL_INVALID_ACCELERATOR_INTEL; \
goto error; \
} \
@@ -251,7 +252,7 @@ do { \
err = CL_INVALID_KERNEL; \
goto error; \
} \
- if (UNLIKELY(KERNEL->magic != CL_MAGIC_KERNEL_HEADER)) { \
+ if (UNLIKELY(!CL_OBJECT_IS_KERNEL(KERNEL))) { \
err = CL_INVALID_KERNEL; \
goto error; \
} \
@@ -263,7 +264,7 @@ do { \
err = CL_INVALID_PROGRAM; \
goto error; \
} \
- if (UNLIKELY(PROGRAM->magic != CL_MAGIC_PROGRAM_HEADER)) {\
+ if (UNLIKELY(!CL_OBJECT_IS_PROGRAM(PROGRAM))) { \
err = CL_INVALID_PROGRAM; \
goto error; \
} \
@@ -351,9 +352,80 @@ static INLINE int atomic_add(atomic_t *v, const int c) {
: "m"(*v), "r"(i));
return i;
}
+static INLINE int atomic_read(atomic_t *v) {
+ return *v;
+}
static INLINE int atomic_inc(atomic_t *v) { return atomic_add(v, 1); }
static INLINE int atomic_dec(atomic_t *v) { return atomic_add(v, -1); }
-#endif /* __CL_UTILS_H__ */
+/* Define one list node. */
+typedef struct list_node {
+ struct list_node *n;
+ struct list_node *p;
+} list_node;
+typedef struct list_head {
+ list_node head_node;
+} list_head;
+
+static inline void list_node_init(list_node *node)
+{
+ node->n = node;
+ node->p = node;
+}
+static inline int list_node_out_of_list(const struct list_node *node)
+{
+ return node->n == node;
+}
+static inline void list_init(list_head *head)
+{
+ head->head_node.n = &head->head_node;
+ head->head_node.p = &head->head_node;
+}
+extern void list_node_insert_before(list_node *node, list_node *the_new);
+extern void list_node_insert_after(list_node *node, list_node *the_new);
+static inline void list_node_del(struct list_node *node)
+{
+ node->n->p = node->p;
+ node->p->n = node->n;
+ /* Point both links back to self for safety. */
+ node->p = node;
+ node->n = node;
+}
+static inline void list_add(list_head *head, list_node *the_new)
+{
+ list_node_insert_after(&head->head_node, the_new);
+}
+static inline void list_add_tail(list_head *head, list_node *the_new)
+{
+ list_node_insert_before(&head->head_node, the_new);
+}
+static inline int list_empty(const struct list_head *head)
+{
+ return head->head_node.n == &head->head_node;
+}
+/* Move the content from one head to another. */
+extern void list_move(struct list_head *the_old, struct list_head *the_new);
+/* Merge the content of the two lists to one head. */
+extern void list_merge(struct list_head *head, struct list_head *to_merge);
+
+#undef offsetof
+#ifdef __compiler_offsetof
+#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
+#else
+#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER)
+#endif
+#define list_entry(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) ); })
+
+#define list_for_each(pos, head) \
+ for (pos = (head)->head_node.n; pos != &((head)->head_node); pos = pos->n)
+#define list_for_each_safe(pos, ne, head) \
+ for (pos = (head)->head_node.n, ne = pos->n; pos != &((head)->head_node); \
+ pos = ne, ne = pos->n)
+
+extern cl_int cl_get_info_helper(const void *src, size_t src_size, void *dst,
+ size_t dst_size, size_t *ret_size);
+#endif /* __CL_UTILS_H__ */
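Note on the list API above: it is an intrusive, kernel-style doubly
linked list, so the link node is embedded in the element and
list_entry() recovers the container (it relies on GCC typeof and
statement expressions, which beignet already assumes). A minimal usage
sketch; the job type is illustrative, not part of the patch:

    #include <stdlib.h>
    #include "cl_utils.h"

    struct job {
      int id;
      list_node node; /* intrusive link embedded in the element */
    };

    static void example(void)
    {
      list_head pending;
      list_init(&pending);

      for (int i = 0; i < 3; i++) {
        struct job *j = malloc(sizeof(*j));
        j->id = i;
        list_node_init(&j->node);
        list_add_tail(&pending, &j->node); /* FIFO order: 0, 1, 2 */
      }

      /* The _safe variant permits deleting the current node mid-walk. */
      list_node *pos, *next;
      list_for_each_safe(pos, next, &pending) {
        struct job *j = list_entry(pos, struct job, node);
        list_node_del(pos);
        free(j);
      }
    }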
diff --git a/src/intel/intel_cl_gl_share_image_info.h b/src/intel/intel_cl_gl_share_image_info.h
new file mode 100644
index 0000000..21fbbd1
--- /dev/null
+++ b/src/intel/intel_cl_gl_share_image_info.h
@@ -0,0 +1,18 @@
+#ifndef __INTEL_CL_GL_SHARE_IMAGE_INFO_
+#define __INTEL_CL_GL_SHARE_IMAGE_INFO_
+
+struct _intel_cl_gl_share_image_info {
+ int fd;
+ size_t w;
+ size_t h;
+ size_t depth;
+ size_t pitch;
+ int tiling;
+ size_t offset;
+ size_t tile_x;
+ size_t tile_y;
+ unsigned int gl_format;
+ size_t row_pitch, slice_pitch;
+};
+
+#endif
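Note on the struct above: it carries the geometry a CL image needs when
wrapping a GL texture exported as a dma-buf fd (the EGL path later in
this patch fills it in). A hedged sketch of how the fields compose into
a minimum buffer size; this helper is hypothetical, not part of the
patch:

    #include <stddef.h>
    #include "intel_cl_gl_share_image_info.h"

    /* Hypothetical: conservative byte size of the shared image,
     * assuming slice_pitch covers one full slice and is 0 for 2D. */
    static size_t share_image_min_size(const struct _intel_cl_gl_share_image_info *info)
    {
      if (info->depth > 1 && info->slice_pitch)
        return info->offset + info->slice_pitch * info->depth;
      return info->offset + info->row_pitch * info->h;
    }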
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
deleted file mode 100644
index 188c1fa..0000000
--- a/src/intel/intel_dri_resource_sharing.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#define HAVE_PTHREAD 1
-#include <errno.h>
-#include <time.h>
-#include "main/context.h"
-#include "main/renderbuffer.h"
-#include "main/texobj.h"
-#include <stdbool.h>
-#include <string.h>
-#include <drm.h>
-#include <i915_drm.h>
-#include <intel_bufmgr.h>
-#include <GL/internal/dri_interface.h>
-#include "intel_mipmap_tree.h"
-#include "intel_regions.h"
-#include "intel_context.h"
-
-#include "intel_dri_resource_sharing.h"
-#include "intel_dri_resource_sharing_int.h"
-
-#include <dlfcn.h>
-/**
- * Sets up a DRIImage structure to point to our shared image in a region
- */
-static bool
-intel_setup_cl_region_from_mipmap_tree(void *driver,
- struct intel_context *intel,
- struct intel_mipmap_tree *mt,
- GLuint level, GLuint zoffset,
- struct _intel_dri_share_image_region *region)
-{
- unsigned int draw_x, draw_y;
- uint32_t mask_x, mask_y;
- struct intel_region *null_region = (struct intel_region *)NULL;
-
- intel_miptree_check_level_layer(mt, level, zoffset);
-
- _intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
- _intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
-
- region->w = mt->level[level].width;
- region->h = mt->level[level].height;
- region->tile_x = draw_x & mask_x;
- region->tile_y = draw_y & mask_y;
- region->tiling = mt->region->tiling;
- /* XXX hard code to 1 right now. */
- region->depth = 1;
- region->row_pitch = mt->region->pitch;
-
- region->offset = _intel_region_get_aligned_offset(mt->region,
- draw_x & ~mask_x,
- draw_y & ~mask_y,
- false);
- if (!_intel_region_flink(mt->region, &region->name))
- return false;
- _intel_region_reference(&null_region, mt->region);
- return true;
-}
-
-typedef void
-_mesa_test_texobj_completeness_t( const struct gl_context *ctx,
- struct gl_texture_object *t );
-_mesa_test_texobj_completeness_t *__mesa_test_texobj_completeness;
-
-typedef struct gl_texture_object *
-_mesa_lookup_texture_t( const struct gl_context *ctx, GLuint id);
-_mesa_lookup_texture_t *__mesa_lookup_texture;
-
-static struct gl_texture_object *
-intel_get_gl_obj_from_texture(void *driver,
- struct intel_context *intel,
- GLenum target, GLint level,
- GLuint texture, GLuint face)
-{
- struct gl_texture_object *obj;
- __mesa_lookup_texture = dlsym(driver, "_mesa_lookup_texture");
- obj = __mesa_lookup_texture(&intel->ctx, texture);
- if (!obj || obj->Target != target) {
- return NULL;
- }
-
- __mesa_test_texobj_completeness = dlsym(driver, "_mesa_test_texobj_completeness");
- __mesa_test_texobj_completeness(&intel->ctx, obj);
- if (!obj->_BaseComplete || (level > 0 && !obj->_MipmapComplete)) {
- return NULL;
- }
-
- if (level < obj->BaseLevel || level > obj->_MaxLevel) {
- return NULL;
- }
-
- return obj;
-}
-
-static GLenum
-get_cl_gl_format(mesa_format format)
-{
- switch (format) {
- case MESA_FORMAT_R8G8B8A8_UNORM:
- return GL_RGBA;
- case MESA_FORMAT_A8R8G8B8_UNORM:
- return GL_BGRA;
- default:
- return GL_BGRA;
- }
-}
-
-static bool
-intelAcquireTexture(void *driver, __DRIcontext *context, GLenum target,
- GLint level, GLuint texture, void *user_data)
-{
- struct _intel_dri_share_image_region *region = intel_dri_share_image_region(user_data);
- struct intel_context *intel = context->driverPrivate;
- struct gl_texture_object *obj;
- struct intel_texture_object *iobj;
- /* XXX Always be face 0? */
- GLuint face = 0;
-
- obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
- if (obj == NULL)
- return false;
- iobj = intel_texture_object(obj);
- region->gl_format = get_cl_gl_format(obj->Image[face][level]->TexFormat);
- return intel_setup_cl_region_from_mipmap_tree(driver, intel, iobj->mt, level, 0, region);
-}
-
-static bool
-intelReleaseTexture(void *driver, __DRIcontext *context, GLenum target,
- GLint level, GLuint texture)
-{
- struct intel_context *intel = context->driverPrivate;
- struct gl_texture_object *obj;
- struct intel_texture_object *iobj;
- /* XXX Always be face 0? */
- GLuint face = 0;
-
- obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
- if (obj == NULL)
- return false;
-
- iobj = intel_texture_object(obj);
- _intel_region_release(&iobj->mt->region);
- return true;
-}
-
-static bool
-intelAcquireBufferObj(void *driver, __DRIcontext *driContextPriv,
- GLuint bufobj, void *user_data)
-{
- return false;
-}
-
-static bool
-intelReleaseBufferObj(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
-{
- return false;
-}
-
-static bool
-intelAcquireRenderBuffer(void *driver, __DRIcontext *driContextPriv,
- GLuint bufobj, void *user_data)
-{
- return false;
-}
-
-static bool
-intelReleaseRenderBuffer(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
-{
- return false;
-}
-
-#include "cl_driver.h"
-void
-intel_set_cl_gl_callbacks(void)
-{
- cl_gl_acquire_texture = (cl_gl_acquire_texture_cb*)intelAcquireTexture;
- cl_gl_release_texture = (cl_gl_release_texture_cb*)intelReleaseTexture;
- cl_gl_acquire_buffer_object = (cl_gl_acquire_buffer_object_cb*)intelAcquireBufferObj;
- cl_gl_release_buffer_object = (cl_gl_release_buffer_object_cb*)intelReleaseBufferObj;
- cl_gl_acquire_render_buffer = (cl_gl_acquire_render_buffer_cb*)intelAcquireRenderBuffer;
- cl_gl_release_render_buffer = (cl_gl_release_render_buffer_cb*)intelReleaseRenderBuffer;
-}
diff --git a/src/intel/intel_dri_resource_sharing.h b/src/intel/intel_dri_resource_sharing.h
deleted file mode 100644
index 6d2ce4d..0000000
--- a/src/intel/intel_dri_resource_sharing.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __INTEL_DRI_RESOURCE_SHARING_H__
-#define __INTEL_DRI_RESOURCE_SHARING_H__
-
-struct _intel_dri_share_image_region {
- unsigned int name;
- size_t w;
- size_t h;
- size_t depth;
- size_t pitch;
- int tiling;
- size_t offset;
- size_t tile_x;
- size_t tile_y;
- unsigned int gl_format;
- size_t row_pitch, slice_pitch;
-};
-
-struct _intel_dri_share_buffer_object {
- unsigned int name;
- size_t sz;
- size_t offset;
-};
-
-inline static struct _intel_dri_share_image_region *
-intel_dri_share_image_region(void *user_data)
-{
- return (struct _intel_dri_share_image_region *)user_data;
-}
-
-inline static struct _intel_dri_share_buffer_object *
-intel_dri_share_buffer_object(void *user_data)
-{
- return (struct _intel_dri_share_buffer_object *)user_data;
-}
-
-extern void intel_set_cl_gl_callbacks(void);
-
-
-#endif
diff --git a/src/intel/intel_dri_resource_sharing_int.h b/src/intel/intel_dri_resource_sharing_int.h
deleted file mode 100644
index c7b283a..0000000
--- a/src/intel/intel_dri_resource_sharing_int.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*****************************************************************
- * The following functions are copied from i965 driver, commit
- * id 292368570a13501dfa95b1b0dd70966caf6ffc6b. Need to keep consistant
- * with the dri driver installed on current system.
- *****************************************************************/
-static bool
-_intel_region_flink(struct intel_region *region, uint32_t *name)
-{
- if (region->name == 0) {
- if (drm_intel_bo_flink(region->bo, &region->name))
- return false;
- }
-
- *name = region->name;
-
- return true;
-}
-
-#define _DBG(...)
-static void
-_intel_region_release(struct intel_region **region_handle)
-{
- struct intel_region *region = *region_handle;
-
- if (region == NULL) {
- _DBG("%s NULL\n", __FUNCTION__);
- return;
- }
-
- _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
-
- ASSERT(region->refcount > 0);
- region->refcount--;
-
- if (region->refcount == 0) {
- drm_intel_bo_unreference(region->bo);
-
- free(region);
- }
- *region_handle = NULL;
-}
-
-static void
-_intel_region_reference(struct intel_region **dst, struct intel_region *src)
-{
- _DBG("%s: %p(%d) -> %p(%d)\n", __FUNCTION__,
- *dst, *dst ? (*dst)->refcount : 0, src, src ? src->refcount : 0);
-
- if (src != *dst) {
- if (*dst)
- _intel_region_release(dst);
-
- if (src)
- src->refcount++;
- *dst = src;
- }
-}
-
-/**
- * This function computes masks that may be used to select the bits of the X
- * and Y coordinates that indicate the offset within a tile. If the region is
- * untiled, the masks are set to 0.
- */
-static void
-_intel_region_get_tile_masks(struct intel_region *region,
- uint32_t *mask_x, uint32_t *mask_y,
- bool map_stencil_as_y_tiled)
-{
- int cpp = region->cpp;
- uint32_t tiling = region->tiling;
-
- if (map_stencil_as_y_tiled)
- tiling = I915_TILING_Y;
-
- switch (tiling) {
- default:
- assert(false);
- case I915_TILING_NONE:
- *mask_x = *mask_y = 0;
- break;
- case I915_TILING_X:
- *mask_x = 512 / cpp - 1;
- *mask_y = 7;
- break;
- case I915_TILING_Y:
- *mask_x = 128 / cpp - 1;
- *mask_y = 31;
- break;
- }
-}
-
-/**
- * Compute the offset (in bytes) from the start of the region to the given x
- * and y coordinate. For tiled regions, caller must ensure that x and y are
- * multiples of the tile size.
- */
-static uint32_t
-_intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
- uint32_t y, bool map_stencil_as_y_tiled)
-{
- int cpp = region->cpp;
- uint32_t pitch = region->pitch;
- uint32_t tiling = region->tiling;
-
- if (map_stencil_as_y_tiled) {
- tiling = I915_TILING_Y;
-
- /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
- * gets transformed into a 32-high Y-tile. Accordingly, the pitch of
- * the resulting region is twice the pitch of the original region, since
- * each row in the Y-tiled view corresponds to two rows in the actual
- * W-tiled surface. So we need to correct the pitch before computing
- * the offsets.
- */
- pitch *= 2;
- }
-
- switch (tiling) {
- default:
- assert(false);
- case I915_TILING_NONE:
- return y * pitch + x * cpp;
- case I915_TILING_X:
- assert((x % (512 / cpp)) == 0);
- assert((y % 8) == 0);
- return y * pitch + x / (512 / cpp) * 4096;
- case I915_TILING_Y:
- assert((x % (128 / cpp)) == 0);
- assert((y % 32) == 0);
- return y * pitch + x / (128 / cpp) * 4096;
- }
-}
-
-static void
-_intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
- GLuint level, GLuint slice,
- GLuint *x, GLuint *y)
-{
- assert(slice < mt->level[level].depth);
-
- *x = mt->level[level].slice[slice].x_offset;
- *y = mt->level[level].slice[slice].y_offset;
-}
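Note on the deleted helpers above: the tiled-offset arithmetic is easy
to sanity-check in isolation. An X tile is 512 bytes wide and 8 rows
high (4096 bytes total), so the aligned offset is whole rows plus whole
tiles. A standalone sketch reproducing that computation:

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors the deleted _intel_region_get_aligned_offset for
     * I915_TILING_X: x and y must be tile-aligned. */
    static uint32_t x_tiled_offset(uint32_t x, uint32_t y,
                                   uint32_t pitch, int cpp)
    {
      assert((x % (512 / cpp)) == 0);
      assert((y % 8) == 0);
      return y * pitch + x / (512 / cpp) * 4096;
    }

    int main(void)
    {
      /* cpp = 4, pitch = 4096: the tile at (x=128, y=8) starts at
       * 8 * 4096 + (128 / 128) * 4096 = 36864 bytes. */
      return x_tiled_offset(128, 8, 4096, 4) == 36864 ? 0 : 1;
    }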
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 7a46c1d..b8a1b52 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -46,10 +46,11 @@
*
*/
-#if defined(HAS_EGL)
+#if defined(HAS_GL_EGL)
+#define EGL_EGLEXT_PROTOTYPES
#include "GL/gl.h"
#include "EGL/egl.h"
-#include "x11/mesa_egl_extension.h"
+#include <EGL/eglext.h>
#endif
#ifdef HAS_X11
@@ -99,9 +100,9 @@ intel_driver_new(void)
exit:
return driver;
error:
- intel_driver_delete(driver);
- driver = NULL;
- goto exit;
+intel_driver_delete(driver);
+driver = NULL;
+goto exit;
}
/* just used for maximum relocation number in drm_intel */
@@ -111,372 +112,385 @@ error:
static void
intel_driver_aub_dump(intel_driver_t *driver)
{
- char *val;
- val = getenv("OCL_DUMP_AUB");
- if (!val)
- return;
- if (atoi(val) != 0) {
- drm_intel_bufmgr_gem_set_aub_filename(driver->bufmgr,
- "beignet.aub");
- drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
- }
+char *val;
+val = getenv("OCL_DUMP_AUB");
+if (!val)
+ return;
+if (atoi(val) != 0) {
+ drm_intel_bufmgr_gem_set_aub_filename(driver->bufmgr,
+ "beignet.aub");
+ drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
+}
}
static int
intel_driver_memman_init(intel_driver_t *driver)
{
- driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
- if (!driver->bufmgr) return 0;
- drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
- driver->device_id = drm_intel_bufmgr_gem_get_devid(driver->bufmgr);
- intel_driver_aub_dump(driver);
- return 1;
+driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
+if (!driver->bufmgr) return 0;
+drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
+driver->device_id = drm_intel_bufmgr_gem_get_devid(driver->bufmgr);
+intel_driver_aub_dump(driver);
+return 1;
}
-static void
+static int
intel_driver_context_init(intel_driver_t *driver)
{
- driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
- assert(driver->ctx);
+driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+if (!driver->ctx)
+ return 0;
+driver->null_bo = NULL;
+#ifdef HAS_BO_SET_SOFTPIN
+drm_intel_bo *bo = dri_bo_alloc(driver->bufmgr, "null_bo", 64*1024, 4096);
+drm_intel_bo_set_softpin_offset(bo, 0);
+// Don't reuse it; that would make two bos try to bind to the same
+// address, which is unreasonable.
+drm_intel_bo_disable_reuse(bo);
+driver->null_bo = bo;
+#endif
+return 1;
}
static void
intel_driver_context_destroy(intel_driver_t *driver)
{
- if(driver->ctx)
- drm_intel_gem_context_destroy(driver->ctx);
- driver->ctx = NULL;
+if (driver->null_bo)
+ drm_intel_bo_unreference(driver->null_bo);
+if(driver->ctx)
+ drm_intel_gem_context_destroy(driver->ctx);
+driver->ctx = NULL;
}
static int
intel_driver_init(intel_driver_t *driver, int dev_fd)
{
- driver->fd = dev_fd;
- driver->locked = 0;
- pthread_mutex_init(&driver->ctxmutex, NULL);
+driver->fd = dev_fd;
+driver->locked = 0;
+pthread_mutex_init(&driver->ctxmutex, NULL);
- if (!intel_driver_memman_init(driver)) return 0;
- intel_driver_context_init(driver);
+if (!intel_driver_memman_init(driver)) return 0;
+if (!intel_driver_context_init(driver)) return 0;
#if EMULATE_GEN
- driver->gen_ver = EMULATE_GEN;
- if (EMULATE_GEN == 75)
- driver->device_id = PCI_CHIP_HASWELL_L; /* we pick L for HSW */
- else if (EMULATE_GEN == 7)
- driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
- else if (EMULATE_GEN == 6)
- driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
- else
- FATAL ("Unsupported Gen for emulation");
+driver->gen_ver = EMULATE_GEN;
+if (EMULATE_GEN == 75)
+ driver->device_id = PCI_CHIP_HASWELL_L; /* we pick L for HSW */
+else if (EMULATE_GEN == 7)
+ driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
+else if (EMULATE_GEN == 6)
+ driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
+else
+ FATAL ("Unsupported Gen for emulation");
#else
- if (IS_GEN9(driver->device_id))
- driver->gen_ver = 9;
- else if (IS_GEN8(driver->device_id))
- driver->gen_ver = 8;
- else if (IS_GEN75(driver->device_id))
- driver->gen_ver = 75;
- else if (IS_GEN7(driver->device_id))
- driver->gen_ver = 7;
- else if (IS_GEN6(driver->device_id))
- driver->gen_ver = 6;
- else if(IS_IGDNG(driver->device_id))
- driver->gen_ver = 5;
- else
- driver->gen_ver = 4;
+if (IS_GEN9(driver->device_id))
+ driver->gen_ver = 9;
+else if (IS_GEN8(driver->device_id))
+ driver->gen_ver = 8;
+else if (IS_GEN75(driver->device_id))
+ driver->gen_ver = 75;
+else if (IS_GEN7(driver->device_id))
+ driver->gen_ver = 7;
+else if (IS_GEN6(driver->device_id))
+ driver->gen_ver = 6;
+else if(IS_IGDNG(driver->device_id))
+ driver->gen_ver = 5;
+else
+ driver->gen_ver = 4;
#endif /* EMULATE_GEN */
- return 1;
+return 1;
}
static cl_int
intel_driver_open(intel_driver_t *intel, cl_context_prop props)
{
- int cardi;
+int cardi;
#ifdef HAS_X11
- char *driver_name;
+char *driver_name;
#endif
- if (props != NULL
- && props->gl_type != CL_GL_NOSHARE
- && props->gl_type != CL_GL_GLX_DISPLAY
- && props->gl_type != CL_GL_EGL_DISPLAY) {
- fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
- return CL_INVALID_OPERATION;
- }
+if (props != NULL
+ && props->gl_type != CL_GL_NOSHARE
+ && props->gl_type != CL_GL_GLX_DISPLAY
+ && props->gl_type != CL_GL_EGL_DISPLAY) {
+ fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
+ return CL_INVALID_OPERATION;
+}
#ifdef HAS_X11
- intel->x11_display = XOpenDisplay(NULL);
-
- if(intel->x11_display) {
- if((intel->dri_ctx = getDRI2State(intel->x11_display,
- DefaultScreen(intel->x11_display),
- &driver_name))) {
- intel_driver_init_shared(intel, intel->dri_ctx);
- Xfree(driver_name);
- }
- else
- fprintf(stderr, "X server found. dri2 connection failed! \n");
+intel->x11_display = XOpenDisplay(NULL);
+
+if(intel->x11_display) {
+ if((intel->dri_ctx = getDRI2State(intel->x11_display,
+ DefaultScreen(intel->x11_display),
+ &driver_name))) {
+ intel_driver_init_shared(intel, intel->dri_ctx);
+ Xfree(driver_name);
}
+ else
+ fprintf(stderr, "X server found. dri2 connection failed! \n");
+}
#endif
- if(!intel_driver_is_active(intel)) {
- char card_name[20];
- for(cardi = 0; cardi < 16; cardi++) {
- sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
- if (access(card_name, R_OK) != 0)
- continue;
- if(intel_driver_init_render(intel, card_name))
- break;
- }
+if(!intel_driver_is_active(intel)) {
+ char card_name[20];
+ for(cardi = 0; cardi < 16; cardi++) {
+ sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
+ if (access(card_name, R_OK) != 0)
+ continue;
+ if(intel_driver_init_render(intel, card_name))
+ break;
}
+}
- if(!intel_driver_is_active(intel)) {
- char card_name[20];
- for(cardi = 0; cardi < 16; cardi++) {
- sprintf(card_name, "/dev/dri/card%d", cardi);
- if (access(card_name, R_OK) != 0)
- continue;
- if(intel_driver_init_master(intel, card_name))
- break;
- }
+if(!intel_driver_is_active(intel)) {
+ char card_name[20];
+ for(cardi = 0; cardi < 16; cardi++) {
+ sprintf(card_name, "/dev/dri/card%d", cardi);
+ if (access(card_name, R_OK) != 0)
+ continue;
+ if(intel_driver_init_master(intel, card_name))
+ break;
}
+}
- if(!intel_driver_is_active(intel)) {
- fprintf(stderr, "Device open failed, aborting...\n");
- return CL_DEVICE_NOT_FOUND;
- }
+if(!intel_driver_is_active(intel)) {
+ fprintf(stderr, "Device open failed, aborting...\n");
+ return CL_DEVICE_NOT_FOUND;
+}
-#ifdef HAS_EGL
- if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
- assert(props->egl_display);
- }
+#ifdef HAS_GL_EGL
+if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
+ assert(props->egl_display);
+}
#endif
- return CL_SUCCESS;
+return CL_SUCCESS;
}
static void
intel_driver_close(intel_driver_t *intel)
{
- //Due to the drm change about the test usrptr, we need to destroy the bufmgr
- //befor the driver was closed, otherwise the test usrptr will not be freed.
- if (intel->bufmgr)
- drm_intel_bufmgr_destroy(intel->bufmgr);
+//Due to the drm change about the test userptr, we need to destroy the bufmgr
+//before the driver is closed, otherwise the test userptr will not be freed.
+if (intel->bufmgr)
+ drm_intel_bufmgr_destroy(intel->bufmgr);
#ifdef HAS_X11
- if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
- if(intel->x11_display) XCloseDisplay(intel->x11_display);
+if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
+if(intel->x11_display) XCloseDisplay(intel->x11_display);
#endif
- if(intel->need_close) {
- close(intel->fd);
- intel->need_close = 0;
- }
- intel->dri_ctx = NULL;
- intel->x11_display = NULL;
- intel->fd = -1;
+if(intel->need_close) {
+ close(intel->fd);
+ intel->need_close = 0;
+}
+intel->dri_ctx = NULL;
+intel->x11_display = NULL;
+intel->fd = -1;
}
LOCAL int
intel_driver_is_active(intel_driver_t *driver) {
- return driver->fd >= 0;
+return driver->fd >= 0;
}
#ifdef HAS_X11
LOCAL int
intel_driver_init_shared(intel_driver_t *driver, dri_state_t *state)
{
- int ret;
- assert(state);
- if(state->driConnectedFlag != DRI2)
- return 0;
- ret = intel_driver_init(driver, state->fd);
- driver->need_close = 0;
- return ret;
+int ret;
+assert(state);
+if(state->driConnectedFlag != DRI2)
+ return 0;
+ret = intel_driver_init(driver, state->fd);
+driver->need_close = 0;
+return ret;
}
#endif
LOCAL int
intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
{
- int dev_fd, ret;
+int dev_fd, ret;
- drm_client_t client;
+drm_client_t client;
- // usually dev_name = "/dev/dri/card%d"
- dev_fd = open(dev_name, O_RDWR);
- if (dev_fd == -1) {
- fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
- return 0;
- }
+// usually dev_name = "/dev/dri/card%d"
+dev_fd = open(dev_name, O_RDWR);
+if (dev_fd == -1) {
+ fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
+ return 0;
+}
- // Check that we're authenticated
- memset(&client, 0, sizeof(drm_client_t));
- ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
- if (ret == -1) {
- fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
- close(dev_fd);
- return 0;
- }
+// Check that we're authenticated
+memset(&client, 0, sizeof(drm_client_t));
+ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
+if (ret == -1) {
+ fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
+ close(dev_fd);
+ return 0;
+}
- if (!client.auth) {
- fprintf(stderr, "%s not authenticated\n", dev_name);
- close(dev_fd);
- return 0;
- }
+if (!client.auth) {
+ fprintf(stderr, "%s not authenticated\n", dev_name);
+ close(dev_fd);
+ return 0;
+}
- ret = intel_driver_init(driver, dev_fd);
- driver->need_close = 1;
+ret = intel_driver_init(driver, dev_fd);
+driver->need_close = 1;
- return ret;
+return ret;
}
LOCAL int
intel_driver_init_render(intel_driver_t *driver, const char* dev_name)
{
- int dev_fd, ret;
+int dev_fd, ret;
- dev_fd = open(dev_name, O_RDWR);
- if (dev_fd == -1)
- return 0;
+dev_fd = open(dev_name, O_RDWR);
+if (dev_fd == -1)
+ return 0;
- ret = intel_driver_init(driver, dev_fd);
- driver->need_close = 1;
+ret = intel_driver_init(driver, dev_fd);
+driver->need_close = 1;
- return ret;
+return ret;
}
LOCAL int
intel_driver_terminate(intel_driver_t *driver)
{
- pthread_mutex_destroy(&driver->ctxmutex);
+pthread_mutex_destroy(&driver->ctxmutex);
- if(driver->need_close) {
- close(driver->fd);
- driver->need_close = 0;
- }
- driver->fd = -1;
- return 1;
+if(driver->need_close) {
+ close(driver->fd);
+ driver->need_close = 0;
+}
+driver->fd = -1;
+return 1;
}
LOCAL void
intel_driver_lock_hardware(intel_driver_t *driver)
{
- PPTHREAD_MUTEX_LOCK(driver);
- assert(!driver->locked);
- driver->locked = 1;
+PPTHREAD_MUTEX_LOCK(driver);
+assert(!driver->locked);
+driver->locked = 1;
}
LOCAL void
intel_driver_unlock_hardware(intel_driver_t *driver)
{
- driver->locked = 0;
- PPTHREAD_MUTEX_UNLOCK(driver);
+driver->locked = 0;
+PPTHREAD_MUTEX_UNLOCK(driver);
}
LOCAL dri_bo*
intel_driver_share_buffer_from_name(intel_driver_t *driver, const char *sname, uint32_t name)
{
- dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
- sname,
- name);
- if (bo == NULL) {
- fprintf(stderr, "intel_bo_gem_create_from_name create \"%s\" bo from name %d failed: %s\n", sname, name, strerror(errno));
- return NULL;
- }
- return bo;
+dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
+ sname,
+ name);
+if (bo == NULL) {
+ fprintf(stderr, "intel_bo_gem_create_from_name create \"%s\" bo from name %d failed: %s\n", sname, name, strerror(errno));
+ return NULL;
+}
+return bo;
}
LOCAL dri_bo*
intel_driver_share_buffer_from_fd(intel_driver_t *driver, int fd, int size)
{
- dri_bo *bo = drm_intel_bo_gem_create_from_prime(driver->bufmgr,
- fd,
- size);
- if (bo == NULL) {
- fprintf(stderr, "drm_intel_bo_gem_create_from_prime create bo(size %d) from fd %d failed: %s\n", size, fd, strerror(errno));
- return NULL;
- }
- return bo;
+dri_bo *bo = drm_intel_bo_gem_create_from_prime(driver->bufmgr,
+ fd,
+ size);
+if (bo == NULL) {
+ fprintf(stderr, "drm_intel_bo_gem_create_from_prime create bo(size %d) from fd %d failed: %s\n", size, fd, strerror(errno));
+ return NULL;
+}
+return bo;
}
LOCAL uint32_t
intel_driver_shared_name(intel_driver_t *driver, dri_bo *bo)
{
- uint32_t name;
- assert(bo);
- dri_bo_flink(bo, &name);
- return name;
+uint32_t name;
+assert(bo);
+dri_bo_flink(bo, &name);
+return name;
}
/* XXX a null props is ok? */
static int
intel_get_device_id(void)
{
- intel_driver_t *driver = NULL;
- int intel_device_id;
-
- driver = intel_driver_new();
- assert(driver != NULL);
- if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
- intel_device_id = driver->device_id;
- intel_driver_context_destroy(driver);
- intel_driver_close(driver);
- intel_driver_terminate(driver);
- intel_driver_delete(driver);
-
- return intel_device_id;
+intel_driver_t *driver = NULL;
+int intel_device_id;
+
+driver = intel_driver_new();
+assert(driver != NULL);
+if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
+intel_device_id = driver->device_id;
+intel_driver_context_destroy(driver);
+intel_driver_close(driver);
+intel_driver_terminate(driver);
+intel_driver_delete(driver);
+
+return intel_device_id;
}
extern void intel_gpgpu_delete_all(intel_driver_t *driver);
static void
cl_intel_driver_delete(intel_driver_t *driver)
{
- if (driver == NULL)
- return;
- intel_gpgpu_delete_all(driver);
- intel_driver_context_destroy(driver);
- intel_driver_close(driver);
- intel_driver_terminate(driver);
- intel_driver_delete(driver);
+if (driver == NULL)
+ return;
+intel_gpgpu_delete_all(driver);
+intel_driver_context_destroy(driver);
+intel_driver_close(driver);
+intel_driver_terminate(driver);
+intel_driver_delete(driver);
}
#include "cl_gbe_loader.h"
static intel_driver_t*
cl_intel_driver_new(cl_context_prop props)
{
- intel_driver_t *driver = NULL;
- TRY_ALLOC_NO_ERR (driver, intel_driver_new());
- if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
+intel_driver_t *driver = NULL;
+TRY_ALLOC_NO_ERR (driver, intel_driver_new());
+if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
exit:
- return driver;
+return driver;
error:
- cl_intel_driver_delete(driver);
- driver = NULL;
- goto exit;
+cl_intel_driver_delete(driver);
+driver = NULL;
+goto exit;
}
static drm_intel_bufmgr*
intel_driver_get_bufmgr(intel_driver_t *drv)
{
- return drv->bufmgr;
+return drv->bufmgr;
}
static uint32_t
intel_driver_get_ver(struct intel_driver *drv)
{
- return drv->gen_ver;
+return drv->gen_ver;
}
static void
intel_driver_enlarge_stack_size(struct intel_driver *drv, int32_t *stack_size)
{
- if (drv->gen_ver == 75)
- *stack_size = *stack_size * 4;
- else if (drv->device_id == PCI_CHIP_BROXTON_1 || drv->device_id == PCI_CHIP_BROXTON_3 ||
- IS_CHERRYVIEW(drv->device_id))
- *stack_size = *stack_size * 2;
+ if (drv->gen_ver == 75)
+ *stack_size = *stack_size * 4;
+ else if (drv->device_id == PCI_CHIP_BROXTON_1 || drv->device_id == PCI_CHIP_BROXTON_3 ||
+ IS_CHERRYVIEW(drv->device_id))
+ *stack_size = *stack_size * 2;
}
static void
intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag)
{
- drv->atomic_test_result = atomic_flag;
+drv->atomic_test_result = atomic_flag;
}
static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
@@ -484,465 +498,525 @@ static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
static int get_cl_tiling(uint32_t drm_tiling)
{
- switch(drm_tiling) {
- case I915_TILING_X: return CL_TILE_X;
- case I915_TILING_Y: return CL_TILE_Y;
- case I915_TILING_NONE: return CL_NO_TILE;
- default:
- assert(0);
- }
- return CL_NO_TILE;
+switch(drm_tiling) {
+case I915_TILING_X: return CL_TILE_X;
+case I915_TILING_Y: return CL_TILE_Y;
+case I915_TILING_NONE: return CL_NO_TILE;
+default:
+ assert(0);
+}
+return CL_NO_TILE;
}
static uint32_t intel_buffer_get_tiling_align(cl_context ctx, uint32_t tiling_mode, uint32_t dim)
{
- uint32_t gen_ver = ((intel_driver_t *)ctx->drv)->gen_ver;
- uint32_t ret = 0;
-
- switch (tiling_mode) {
- case CL_TILE_X:
- if (dim == 0) { //tileX width in bytes
- ret = 512;
- } else if (dim == 1) { //tileX height in number of rows
+uint32_t gen_ver = ((intel_driver_t *)ctx->drv)->gen_ver;
+uint32_t ret = 0;
+
+switch (tiling_mode) {
+case CL_TILE_X:
+ if (dim == 0) { //tileX width in bytes
+ ret = 512;
+ } else if (dim == 1) { //tileX height in number of rows
+ ret = 8;
+ } else if (dim == 2) { //height to calculate slice pitch
+ if (gen_ver == 9) //SKL same as tileY height
ret = 8;
- } else if (dim == 2) { //height to calculate slice pitch
- if (gen_ver == 9) //SKL same as tileY height
- ret = 8;
- else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
- ret = 4;
- else
- ret = 2;
- } else
- assert(0);
- break;
-
- case CL_TILE_Y:
- if (dim == 0) { //tileY width in bytes
- ret = 128;
- } else if (dim == 1) { //tileY height in number of rows
+ else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
+ ret = 4;
+ else
+ ret = 2;
+ } else
+ assert(0);
+ break;
+
+case CL_TILE_Y:
+ if (dim == 0) { //tileY width in bytes
+ ret = 128;
+ } else if (dim == 1) { //tileY height in number of rows
+ ret = 32;
+ } else if (dim == 2) { //height to calculate slice pitch
+ if (gen_ver == 9) //SKL same as tileY height
ret = 32;
- } else if (dim == 2) { //height to calculate slice pitch
- if (gen_ver == 9) //SKL same as tileY height
- ret = 32;
- else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
- ret = 4;
- else
- ret = 2;
- } else
- assert(0);
- break;
+ else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
+ ret = 4;
+ else
+ ret = 2;
+ } else
+ assert(0);
+ break;
- case CL_NO_TILE:
- if (dim == 1 || dim == 2) { //vertical alignment
- if (gen_ver == 8 || gen_ver == 9) //SKL 1D array need 4 alignment qpitch
- ret = 4;
- else
- ret = 2;
- } else
- assert(0);
- break;
- }
+case CL_NO_TILE:
+ if (dim == 1 || dim == 2) { //vertical alignment
+ if (gen_ver == 8 || gen_ver == 9) //SKL 1D array need 4 alignment qpitch
+ ret = 4;
+ else
+ ret = 2;
+ } else
+ assert(0);
+ break;
+}
- return ret;
+return ret;
}
-#if defined(HAS_EGL)
-#include "intel_dri_resource_sharing.h"
+#if defined(HAS_GL_EGL)
+#include "intel_cl_gl_share_image_info.h"
#include "cl_image.h"
+
+static PFNEGLEXPORTDMABUFIMAGEMESAPROC eglExportDMABUFImageMESA_func = NULL;
+
+static int
+get_required_egl_extensions(){
+
+if(eglExportDMABUFImageMESA_func == NULL){
+ eglExportDMABUFImageMESA_func = (PFNEGLEXPORTDMABUFIMAGEMESAPROC) eglGetProcAddress("eglExportDMABUFImageMESA");
+ if(eglExportDMABUFImageMESA_func == NULL){
+ fprintf(stderr, "Failed to get EGL extension function eglExportDMABUFImageMESA\n");
+ return -1;
+ }
+}
+return 0;
+}
+
+
static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
{
- cl_int ret = CL_SUCCESS;
-
- switch (tex_format) {
- case GL_RGBA8:
- case GL_RGBA:
- case GL_RGBA16:
- case GL_RGBA8I:
- case GL_RGBA16I:
- case GL_RGBA32I:
- case GL_RGBA8UI:
- case GL_RGBA16UI:
- case GL_RGBA32UI:
- case GL_RGBA16F:
- case GL_RGBA32F:
- cl_format->image_channel_order = CL_RGBA;
- break;
- case GL_BGRA:
- cl_format->image_channel_order = CL_BGRA;
- break;
- default:
- ret = -1;
- goto error;
- }
+cl_int ret = CL_SUCCESS;
+
+switch (tex_format) {
+case GL_RGBA8:
+case GL_RGBA:
+case GL_RGBA16:
+case GL_RGBA8I:
+case GL_RGBA16I:
+case GL_RGBA32I:
+case GL_RGBA8UI:
+case GL_RGBA16UI:
+case GL_RGBA32UI:
+case GL_RGBA16F:
+case GL_RGBA32F:
+ cl_format->image_channel_order = CL_RGBA;
+ break;
+case GL_BGRA:
+ cl_format->image_channel_order = CL_BGRA;
+ break;
+default:
+ ret = -1;
+ goto error;
+}
- switch (tex_format) {
- case GL_RGBA8:
- case GL_RGBA:
- case GL_BGRA:
- cl_format->image_channel_data_type = CL_UNORM_INT8;
- break;
- case GL_RGBA16:
- cl_format->image_channel_data_type = CL_UNORM_INT16;
- break;
- case GL_RGBA8I:
- cl_format->image_channel_data_type = CL_SIGNED_INT8;
- break;
- case GL_RGBA16I:
- cl_format->image_channel_data_type = CL_SIGNED_INT16;
- break;
- case GL_RGBA32I:
- cl_format->image_channel_data_type = CL_SIGNED_INT32;
- break;
- case GL_RGBA8UI:
- cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
- break;
- case GL_RGBA16UI:
- cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
- break;
- case GL_RGBA32UI:
- cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
- break;
- case GL_RGBA16F:
- cl_format->image_channel_data_type = CL_HALF_FLOAT;
- break;
- case GL_RGBA32F:
- cl_format->image_channel_order = CL_FLOAT;
- break;
- default:
- ret = -1;
- goto error;
- }
+switch (tex_format) {
+case GL_RGBA8:
+case GL_RGBA:
+case GL_BGRA:
+ cl_format->image_channel_data_type = CL_UNORM_INT8;
+ break;
+case GL_RGBA16:
+ cl_format->image_channel_data_type = CL_UNORM_INT16;
+ break;
+case GL_RGBA8I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT8;
+ break;
+case GL_RGBA16I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT16;
+ break;
+case GL_RGBA32I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT32;
+ break;
+case GL_RGBA8UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
+ break;
+case GL_RGBA16UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
+ break;
+case GL_RGBA32UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
+ break;
+case GL_RGBA16F:
+ cl_format->image_channel_data_type = CL_HALF_FLOAT;
+ break;
+case GL_RGBA32F:
+ cl_format->image_channel_order = CL_FLOAT;
+ break;
+default:
+ ret = -1;
+ goto error;
+}
error:
- return ret;
+return ret;
}
static int
get_mem_type_from_target(GLenum texture_target, cl_mem_object_type *type)
{
- switch(texture_target) {
- case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
- case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
- case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
- case GL_TEXTURE_1D_ARRAY: *type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break;
- case GL_TEXTURE_2D_ARRAY: *type = CL_MEM_OBJECT_IMAGE2D_ARRAY; break;
- default:
- return -1;
- }
- return CL_SUCCESS;
+switch(texture_target) {
+case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
+case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
+case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
+case GL_TEXTURE_1D_ARRAY: *type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break;
+case GL_TEXTURE_2D_ARRAY: *type = CL_MEM_OBJECT_IMAGE2D_ARRAY; break;
+default:
+ return -1;
+}
+return CL_SUCCESS;
}
static cl_buffer
intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
- int miplevel, unsigned int texture,
- struct _cl_mem_image *image)
-{
- cl_buffer bo = (cl_buffer) NULL;
- struct _intel_dri_share_image_region region;
- unsigned int bpp, intel_fmt;
- cl_image_format cl_format;
- EGLBoolean ret;
- EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
- EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
- EGL_GL_TEXTURE_TARGET_MESA, target,
- EGL_NONE};
- ret = eglAcquireResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx),
- EGL_GL_TEXTURE_MESA,
- &attrib_list[0], &region);
- if (!ret)
- goto out;
-
- bo = (cl_buffer)intel_driver_share_buffer((intel_driver_t *)ctx->drv, "rendering buffer", region.name);
-
- if (bo == NULL) {
- eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
- goto out;
- }
- region.tiling = get_cl_tiling(region.tiling);
- if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
- goto error;
-
- if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
- goto error;
- intel_fmt = cl_image_get_intel_format(&cl_format);
- if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
- goto error;
- cl_mem_object_type image_type;
- if (get_mem_type_from_target(target, &image_type) != 0)
- goto error;
-
- cl_mem_image_init(image, region.w, region.h,
- image_type, region.depth, cl_format,
- intel_fmt, bpp, region.row_pitch,
- region.slice_pitch, region.tiling,
- region.tile_x, region.tile_y, region.offset);
-out:
- return bo;
+ int miplevel, unsigned int texture,
+ struct _cl_mem_image *image)
+{
+drm_intel_bo *intel_bo = NULL;
+struct _intel_cl_gl_share_image_info info;
+unsigned int bpp, intel_fmt;
+cl_image_format cl_format;
+EGLBoolean ret;
+
+EGLenum e_target;
+//We only support GL_TEXTURE_2D for now because we can't query info such as slice_pitch.
+if(target == GL_TEXTURE_2D)
+ e_target = EGL_GL_TEXTURE_2D;
+else
+ return NULL;
-error:
- cl_buffer_unreference(bo);
- eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+if(get_required_egl_extensions() != 0)
+ return NULL;
+
+EGLAttrib attrib_list[] = {EGL_GL_TEXTURE_LEVEL, miplevel,
+ EGL_NONE};
+EGLImage e_image = eglCreateImage(EGL_DISP(ctx), EGL_CTX(ctx), e_target,
+ (EGLClientBuffer)texture, &attrib_list[0]);
+if(e_image == EGL_NO_IMAGE)
+ return NULL;
+
+int fd, stride, offset;
+ret = eglExportDMABUFImageMESA_func(EGL_DISP(ctx), e_image, &fd, &stride, &offset);
+if(ret != EGL_TRUE){
+ eglDestroyImage(EGL_DISP(ctx), e_image);
+ return NULL;
+}
+info.fd = fd;
+
+/* The size argument only takes effect in intel_driver_share_buffer_from_fd when
+ * the Linux kernel is older than 3.12, so it doesn't matter that we set it to 0 here.
+ */
+int size = 0;
+intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, size);
+
+if (intel_bo == NULL) {
+ eglDestroyImage(EGL_DISP(ctx), e_image);
return NULL;
}
+GLint param_value;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_WIDTH, &param_value);
+info.w = param_value;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_HEIGHT, &param_value);
+info.h = param_value;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_DEPTH, &param_value);
+info.depth = 1;
+info.pitch = stride;
+uint32_t tiling_mode, swizzle_mode;
+drm_intel_bo_get_tiling(intel_bo, &tiling_mode, &swizzle_mode);
+info.offset = offset;
+info.tile_x = 0;
+info.tile_y = 0;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_INTERNAL_FORMAT, &param_value);
+info.gl_format = param_value;
+info.row_pitch = stride;
+info.slice_pitch = 0;
+
+info.tiling = get_cl_tiling(tiling_mode);
+if (cl_get_clformat_from_texture(info.gl_format, &cl_format) != 0)
+ goto error;
+
+if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
+ goto error;
+intel_fmt = cl_image_get_intel_format(&cl_format);
+if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+ goto error;
+cl_mem_object_type image_type;
+if (get_mem_type_from_target(target, &image_type) != 0)
+ goto error;
+
+cl_mem_image_init(image, info.w, info.h,
+ image_type, info.depth, cl_format,
+ intel_fmt, bpp, info.row_pitch,
+ info.slice_pitch, info.tiling,
+ info.tile_x, info.tile_y, info.offset);
+
+struct _cl_mem_gl_image *gl_image = (struct _cl_mem_gl_image*)image;
+gl_image->fd = fd;
+gl_image->egl_image = e_image;
+
+return (cl_buffer) intel_bo;
+
+error:
+drm_intel_bo_unreference(intel_bo);
+close(fd);
+eglDestroyImage(EGL_DISP(ctx), e_image);
+return NULL;
+}
+
static cl_buffer
intel_alloc_buffer_from_texture(cl_context ctx, unsigned int target,
- int miplevel, unsigned int texture,
- struct _cl_mem_image *image)
+ int miplevel, unsigned int texture,
+ struct _cl_mem_image *image)
{
- if (IS_EGL_CONTEXT(ctx))
- return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
+if (IS_EGL_CONTEXT(ctx))
+ return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
- return NULL;
+return NULL;
}
static int
-intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
- int miplevel, unsigned int texture)
+intel_release_buffer_from_texture(cl_context ctx, struct _cl_mem_gl_image *gl_image)
{
- if (IS_EGL_CONTEXT(ctx)) {
- EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
- EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
- EGL_GL_TEXTURE_TARGET_MESA, target,
- EGL_NONE};
-
- eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
- return CL_SUCCESS;
- }
- return -1;
+if (IS_EGL_CONTEXT(ctx)) {
+ close(gl_image->fd);
+ eglDestroyImage(EGL_DISP(ctx), gl_image->egl_image);
+ return CL_SUCCESS;
+}
+return -1;
}
#endif
cl_buffer intel_share_buffer_from_libva(cl_context ctx,
- unsigned int bo_name,
- size_t *sz)
+ unsigned int bo_name,
+ size_t *sz)
{
- drm_intel_bo *intel_bo;
+drm_intel_bo *intel_bo;
- intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
- if (intel_bo == NULL)
- return NULL;
+if (intel_bo == NULL)
+ return NULL;
- if (sz)
- *sz = intel_bo->size;
+if (sz)
+ *sz = intel_bo->size;
- return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
}
cl_buffer intel_share_image_from_libva(cl_context ctx,
- unsigned int bo_name,
- struct _cl_mem_image *image)
+ unsigned int bo_name,
+ struct _cl_mem_image *image)
{
- drm_intel_bo *intel_bo;
- uint32_t intel_tiling, intel_swizzle_mode;
+drm_intel_bo *intel_bo;
+uint32_t intel_tiling, intel_swizzle_mode;
- intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
- if (intel_bo == NULL)
- return NULL;
+if (intel_bo == NULL)
+ return NULL;
- drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
- image->tiling = get_cl_tiling(intel_tiling);
+drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+image->tiling = get_cl_tiling(intel_tiling);
- return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
}
cl_buffer intel_share_buffer_from_fd(cl_context ctx,
- int fd,
- int buffer_size)
+ int fd,
+ int buffer_size)
{
- drm_intel_bo *intel_bo;
+drm_intel_bo *intel_bo;
- intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, buffer_size);
+intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, buffer_size);
- if (intel_bo == NULL)
- return NULL;
+if (intel_bo == NULL)
+ return NULL;
- return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
}
cl_buffer intel_share_image_from_fd(cl_context ctx,
- int fd,
- int image_size,
- struct _cl_mem_image *image)
+ int fd,
+ int image_size,
+ struct _cl_mem_image *image)
{
- drm_intel_bo *intel_bo;
- uint32_t intel_tiling, intel_swizzle_mode;
+drm_intel_bo *intel_bo;
+uint32_t intel_tiling, intel_swizzle_mode;
- intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, image_size);
+intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, image_size);
- if (intel_bo == NULL)
- return NULL;
+if (intel_bo == NULL)
+ return NULL;
- drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
- image->tiling = get_cl_tiling(intel_tiling);
+drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+image->tiling = get_cl_tiling(intel_tiling);
- return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
}
static cl_buffer intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char* name, void *data,size_t size, unsigned long flags)
{
#ifdef HAS_USERPTR
- drm_intel_bo *bo;
- bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags);
- /* Fallback to unsynchronized userptr allocation if kernel has no MMU notifier enabled. */
- if (bo == NULL)
- bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
- return (cl_buffer)bo;
+drm_intel_bo *bo;
+bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags);
+/* Fall back to unsynchronized userptr allocation if the kernel has no MMU notifier enabled. */
+if (bo == NULL)
+ bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
+return (cl_buffer)bo;
#else
- return NULL;
+return NULL;
#endif
}
static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
{
- switch (tiling) {
- case CL_NO_TILE:
- *intel_tiling = I915_TILING_NONE;
- break;
- case CL_TILE_X:
- *intel_tiling = I915_TILING_X;
- break;
- case CL_TILE_Y:
- *intel_tiling = I915_TILING_Y;
- break;
- default:
- assert(0);
- return -1;
- }
- return 0;
+switch (tiling) {
+ case CL_NO_TILE:
+ *intel_tiling = I915_TILING_NONE;
+ break;
+ case CL_TILE_X:
+ *intel_tiling = I915_TILING_X;
+ break;
+ case CL_TILE_Y:
+ *intel_tiling = I915_TILING_Y;
+ break;
+ default:
+ assert(0);
+ return -1;
+}
+return 0;
}
static int intel_buffer_set_tiling(cl_buffer bo,
- cl_image_tiling_t tiling, size_t stride)
+ cl_image_tiling_t tiling, size_t stride)
{
- uint32_t intel_tiling;
- int ret;
- if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
- return -1;
+uint32_t intel_tiling;
+int ret;
+if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
+ return -1;
#ifndef NDEBUG
- uint32_t required_tiling;
- required_tiling = intel_tiling;
+uint32_t required_tiling;
+required_tiling = intel_tiling;
#endif
- ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
- assert(intel_tiling == required_tiling);
- return ret;
+ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
+assert(intel_tiling == required_tiling);
+return ret;
}
#define CHV_CONFIG_WARNING \
- "Warning: can't get GPU's configurations, will use the minimal one. Please update your drm to 2.4.59+ and linux kernel to 4.0.0+.\n"
+ "Warning: can't get GPU's configurations, will use the minimal one. Please update your drm to 2.4.59+ and linux kernel to 4.0.0+.\n"
static void
intel_update_device_info(cl_device_id device)
{
- intel_driver_t *driver;
+intel_driver_t *driver;
- driver = intel_driver_new();
- assert(driver != NULL);
- if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
- intel_driver_delete(driver);
- return;
- }
+driver = intel_driver_new();
+assert(driver != NULL);
+if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
+ intel_driver_delete(driver);
+ return;
+}
#ifdef HAS_USERPTR
- const size_t sz = 4096;
- void *host_ptr;
-
- host_ptr = cl_aligned_malloc(sz, 4096);
- if (host_ptr != NULL) {
- cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
- "CL memory object", host_ptr, sz, 0);
- if (bo == NULL)
- device->host_unified_memory = CL_FALSE;
- else
- drm_intel_bo_unreference((drm_intel_bo*)bo);
- cl_free(host_ptr);
- }
- else
+const size_t sz = 4096;
+void *host_ptr;
+
+host_ptr = cl_aligned_malloc(sz, 4096);
+if (host_ptr != NULL) {
+ cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
+ "CL memory object", host_ptr, sz, 0);
+ if (bo == NULL)
device->host_unified_memory = CL_FALSE;
+ else
+ drm_intel_bo_unreference((drm_intel_bo*)bo);
+ cl_free(host_ptr);
+}
+else
+ device->host_unified_memory = CL_FALSE;
#endif
#ifdef HAS_EU_TOTAL
- unsigned int eu_total;
+unsigned int eu_total;
- /* Prefer driver-queried max compute units if supported */
- if (!drm_intel_get_eu_total(driver->fd, &eu_total))
- device->max_compute_unit = eu_total;
- else if (IS_CHERRYVIEW(device->device_id))
- printf(CHV_CONFIG_WARNING);
+/* Prefer driver-queried max compute units if supported */
+if (!drm_intel_get_eu_total(driver->fd, &eu_total))
+ device->max_compute_unit = eu_total;
+else if (IS_CHERRYVIEW(device->device_id))
+ printf(CHV_CONFIG_WARNING);
#else
- if (IS_CHERRYVIEW(device->device_id)) {
+if (IS_CHERRYVIEW(device->device_id)) {
#if defined(__ANDROID__)
- device->max_compute_unit = 12;
+ device->max_compute_unit = 12;
#else
- printf(CHV_CONFIG_WARNING);
+ printf(CHV_CONFIG_WARNING);
#endif
- }
+}
#endif
#ifdef HAS_SUBSLICE_TOTAL
- unsigned int subslice_total;
+unsigned int subslice_total;
- /* Prefer driver-queried subslice count if supported */
- if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
- device->sub_slice_count = subslice_total;
- else if (IS_CHERRYVIEW(device->device_id))
- printf(CHV_CONFIG_WARNING);
+/* Prefer driver-queried subslice count if supported */
+if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
+ device->sub_slice_count = subslice_total;
+else if (IS_CHERRYVIEW(device->device_id))
+ printf(CHV_CONFIG_WARNING);
#else
- if (IS_CHERRYVIEW(device->device_id)) {
+if (IS_CHERRYVIEW(device->device_id)) {
#if defined(__ANDROID__)
- device->sub_slice_count = 2;
+ device->sub_slice_count = 2;
#else
- printf(CHV_CONFIG_WARNING);
+ printf(CHV_CONFIG_WARNING);
#endif
- }
+ }
#endif
#ifdef HAS_POOLED_EU
- /* BXT pooled eu, 3*6 to 2*9, like sub slice count is 2 */
- int has_pooled_eu;
- if((has_pooled_eu = drm_intel_get_pooled_eu(driver->fd)) > 0)
- device->sub_slice_count = 2;
+ /* BXT pooled EU: the 3x6 EU layout is regrouped as 2x9, so the effective subslice count is 2 */
+ int has_pooled_eu;
+ if((has_pooled_eu = drm_intel_get_pooled_eu(driver->fd)) > 0)
+ device->sub_slice_count = 2;
#ifdef HAS_MIN_EU_IN_POOL
- int min_eu;
- /* for fused down 2x6 devices, beignet don't support. */
- if (has_pooled_eu > 0 && (min_eu = drm_intel_get_min_eu_in_pool(driver->fd)) > 0) {
- assert(min_eu == 9); //don't support fuse down device.
- }
+ int min_eu;
+ /* fused-down 2x6 devices are not supported by beignet */
+ if (has_pooled_eu > 0 && (min_eu = drm_intel_get_min_eu_in_pool(driver->fd)) > 0) {
+ assert(min_eu == 9); // fused-down devices are not supported
+ }
#endif //HAS_MIN_EU_IN_POOL
#endif //HAS_POOLED_EU
- //We should get the device memory dynamically, but the
- //mapablce mem size usage is unknown. Just ignore it.
- size_t total_mem,map_mem;
- if(drm_intel_get_aperture_sizes(driver->fd,&map_mem,&total_mem) == 0)
- device->global_mem_size = (cl_ulong)total_mem;
-
- intel_driver_context_destroy(driver);
- intel_driver_close(driver);
- intel_driver_terminate(driver);
- intel_driver_delete(driver);
+ //We should query the device memory size dynamically, but the
+ //mappable memory usage is unknown, so just ignore it.
+ size_t total_mem,map_mem;
+ if(drm_intel_get_aperture_sizes(driver->fd,&map_mem,&total_mem) == 0)
+ device->global_mem_size = (cl_ulong)total_mem;
+
+ intel_driver_context_destroy(driver);
+ intel_driver_close(driver);
+ intel_driver_terminate(driver);
+ intel_driver_delete(driver);
}
LOCAL void
intel_setup_callbacks(void)
{
- cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
- cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
- cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
- cl_driver_enlarge_stack_size = (cl_driver_enlarge_stack_size_cb *) intel_driver_enlarge_stack_size;
- cl_driver_set_atomic_flag = (cl_driver_set_atomic_flag_cb *) intel_driver_set_atomic_flag;
- cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
- cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
- cl_driver_update_device_info = (cl_driver_update_device_info_cb *) intel_update_device_info;
- cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
- cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
+ cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
+ cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
+ cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+ cl_driver_enlarge_stack_size = (cl_driver_enlarge_stack_size_cb *) intel_driver_enlarge_stack_size;
+ cl_driver_set_atomic_flag = (cl_driver_set_atomic_flag_cb *) intel_driver_set_atomic_flag;
+ cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
+ cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
+ cl_driver_update_device_info = (cl_driver_update_device_info_cb *) intel_update_device_info;
+ cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+ cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
+#ifdef HAS_BO_SET_SOFTPIN
+ cl_buffer_set_softpin_offset = (cl_buffer_set_softpin_offset_cb *) drm_intel_bo_set_softpin_offset;
+ cl_buffer_set_bo_use_full_range = (cl_buffer_set_bo_use_full_range_cb *) drm_intel_bo_use_48b_address_range;
+#endif
+ cl_buffer_disable_reuse = (cl_buffer_disable_reuse_cb *) drm_intel_bo_disable_reuse;
cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
-#if defined(HAS_EGL)
+#if defined(HAS_GL_EGL)
cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
- intel_set_cl_gl_callbacks();
#endif
cl_buffer_get_buffer_from_libva = (cl_buffer_get_buffer_from_libva_cb *) intel_share_buffer_from_libva;
cl_buffer_get_image_from_libva = (cl_buffer_get_image_from_libva_cb *) intel_share_image_from_libva;
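The HAS_BO_SET_SOFTPIN hooks registered above expose libdrm's soft-pinning support, which lets userspace fix a buffer object's GPU virtual address and opt it into the full 48-bit address range; beignet needs this so that pointers embedded in OpenCL 2.0 data structures stay valid across submissions. A minimal sketch of the two underlying libdrm-intel calls (recent libdrm required; the pin address below is illustrative, not beignet's actual placement):

    #include <intel_bufmgr.h>

    static void pin_bo_example(drm_intel_bufmgr *bufmgr)
    {
      drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "example", 4096, 4096);
      if (bo == NULL)
        return;
      /* allow placement anywhere in the 48-bit GPU address space */
      drm_intel_bo_use_48b_address_range(bo, 1);
      /* pin the BO at a userspace-chosen GPU virtual address */
      drm_intel_bo_set_softpin_offset(bo, 0x100000000ull);
      drm_intel_bo_unreference(bo);
    }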
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index 51f0e0d..3be93c2 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -79,6 +79,7 @@ typedef struct intel_driver
{
dri_bufmgr *bufmgr;
drm_intel_context *ctx;
+ drm_intel_bo *null_bo;
int fd;
int device_id;
int gen_ver;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index a643f5c..283b07a 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -975,6 +975,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
size_aux = ALIGN(size_aux, 4096);
bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 4096);
+
if (!bo || dri_bo_map(bo, 1) != 0) {
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
if (bo)
@@ -1527,10 +1528,12 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
uint32_t internal_offset, size_t size, uint8_t bti)
{
assert(gpgpu->binded_n < max_buf_n);
- gpgpu->binded_buf[gpgpu->binded_n] = buf;
- gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
- gpgpu->binded_offset[gpgpu->binded_n] = offset;
- gpgpu->binded_n++;
+ if(offset != -1) {
+ gpgpu->binded_buf[gpgpu->binded_n] = buf;
+ gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
+ gpgpu->binded_offset[gpgpu->binded_n] = offset;
+ gpgpu->binded_n++;
+ }
intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW);
}
@@ -1710,7 +1713,38 @@ intel_gpgpu_build_idrt_gen9(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
}
static int
-intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes_gen7(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+{
+ unsigned char *curbe = NULL;
+ cl_gpgpu_kernel *k = gpgpu->ker;
+ uint32_t i, j;
+
+ /* Upload the data first */
+ if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
+ fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+ return -1;
+ }
+ assert(gpgpu->aux_buf.bo->virtual);
+ curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
+ memcpy(curbe, data, size);
+
+ /* Now put all the relocations for our flat address space */
+ for (i = 0; i < k->thread_n; ++i)
+ for (j = 0; j < gpgpu->binded_n; ++j) {
+ *(uint32_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j];
+ drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
+ gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
+ gpgpu->binded_buf[j],
+ gpgpu->target_buf_offset[j],
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER);
+ }
+ dri_bo_unmap(gpgpu->aux_buf.bo);
+ return 0;
+}
+
+static int
+intel_gpgpu_upload_curbes_gen8(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
{
unsigned char *curbe = NULL;
cl_gpgpu_kernel *k = gpgpu->ker;
@@ -1728,7 +1762,7 @@ intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
/* Now put all the relocations for our flat address space */
for (i = 0; i < k->thread_n; ++i)
for (j = 0; j < gpgpu->binded_n; ++j) {
- *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset + gpgpu->target_buf_offset[j];
+ *(size_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j];
drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
gpgpu->binded_buf[j],
@@ -2050,6 +2084,9 @@ static void
intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
{
gpgpu->ker = kernel;
+ if (gpgpu->drv->null_bo)
+ intel_gpgpu_setup_bti(gpgpu, gpgpu->drv->null_bo, 0, 64*1024, 0xfe, I965_SURFACEFORMAT_RAW);
+
intel_gpgpu_build_idrt(gpgpu, kernel);
dri_bo_unmap(gpgpu->aux_buf.bo);
}
@@ -2068,6 +2105,7 @@ intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
+ const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
@@ -2117,6 +2155,7 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
+ const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
@@ -2144,14 +2183,14 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
else
OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
+ OUT_BATCH(gpgpu->batch, global_dim_off[0]);
OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]);
+ OUT_BATCH(gpgpu->batch, global_dim_off[1]);
OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
- OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
- OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]);
+ OUT_BATCH(gpgpu->batch, global_dim_off[2]);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]);
OUT_BATCH(gpgpu->batch, right_mask);
OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height as 1, so set bottom mask as all 1*/
ADVANCE_BATCH(gpgpu->batch);
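The reworked GPGPU_WALKER emission above is what lets a non-zero global work offset reach the hardware: each dimension's start field now carries global_dim_off[d], and the corresponding end field carries global_wk_dim[d] + global_dim_off[d], rather than the fixed [0, size) range emitted before. A minimal sketch of that per-dimension arithmetic (names mirror the variables above):

    #include <stdint.h>

    /* compute the walker start/end thread-group indices for one dimension */
    static void walker_range(uint32_t dim_off, uint32_t dim_sz,
                             uint32_t *start, uint32_t *end)
    {
      *start = dim_off;        /* first thread-group index */
      *end = dim_sz + dim_off; /* exclusive upper bound */
    }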
@@ -2269,10 +2308,10 @@ intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
/* We want to get the current time of GPU. */
static void
-intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_driver_t* gen_driver, uint64_t* ret_ts)
{
uint64_t result = 0;
- drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+ drm_intel_bufmgr *bufmgr = gen_driver->bufmgr;
/* Get the ts that match the bspec */
result = intel_gpgpu_read_ts_reg(bufmgr);
@@ -2284,15 +2323,13 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
/* Get the GPU execute time. */
static void
-intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
- int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, int index, uint64_t* ret_ts)
{
uint64_t result = 0;
-
- assert(event->ts_buf != NULL);
+ assert(gpgpu->time_stamp_b.bo);
assert(index == 0 || index == 1);
- drm_intel_gem_bo_map_gtt(event->ts_buf);
- uint64_t* ptr = event->ts_buf->virtual;
+ drm_intel_gem_bo_map_gtt(gpgpu->time_stamp_b.bo);
+ uint64_t* ptr = gpgpu->time_stamp_b.bo->virtual;
result = ptr[index];
/* According to BSpec, the timestamp counter should be 36 bits,
@@ -2303,7 +2340,7 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
*ret_ts = result;
- drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+ drm_intel_gem_bo_unmap_gtt(gpgpu->time_stamp_b.bo);
}
static int
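For context on the 36-bit timestamp counter handled above: at the 80 ns per tick assumed in this file, the counter wraps roughly every 92 minutes, so per-kernel timings are safe while very long profiling sessions can observe a wrap. A quick back-of-envelope check:

    #include <stdio.h>

    int main(void)
    {
      unsigned long long ticks = 1ULL << 36; /* 36-bit timestamp counter */
      unsigned long long ns = ticks * 80;    /* 80 ns per tick, as above */
      printf("wraps every %.1f s (~%.1f min)\n", ns / 1e9, ns / 60e9);
      return 0;
    }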
@@ -2409,6 +2446,18 @@ intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu)
return gpgpu->printf_info;
}
+static void
+intel_gpgpu_set_kernel(intel_gpgpu_t *gpgpu, void * kernel)
+{
+ gpgpu->kernel = kernel;
+}
+
+static void*
+intel_gpgpu_get_kernel(intel_gpgpu_t *gpgpu)
+{
+ return gpgpu->kernel;
+}
+
LOCAL void
intel_set_gpgpu_callbacks(int device_id)
{
@@ -2419,7 +2468,6 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
- cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
@@ -2449,6 +2497,8 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
+ cl_gpgpu_set_kernel = (cl_gpgpu_set_kernel_cb *)intel_gpgpu_set_kernel;
+ cl_gpgpu_get_kernel = (cl_gpgpu_get_kernel_cb *)intel_gpgpu_get_kernel;
if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
@@ -2468,7 +2518,8 @@ intel_set_gpgpu_callbacks(int device_id)
intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
- intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
+ intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8;
return;
}
if (IS_GEN9(device_id)) {
@@ -2488,9 +2539,11 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen9;
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8;
return;
}
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen7;
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7;
intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen7;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen7;
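The gen7/gen8 split of intel_gpgpu_upload_curbes above reflects a pointer-width change: pre-gen8 kernels read 32-bit buffer addresses out of the curbe, while gen8+ kernels read full 64-bit addresses (hence the switch to offset64 and a size_t-wide store). A minimal sketch of the difference, assuming a 64-bit host where size_t is 8 bytes:

    #include <stdint.h>
    #include <string.h>

    /* patch one curbe slot with a buffer's GPU address */
    static void patch_curbe_slot(unsigned char *slot, uint64_t gpu_addr,
                                 int is_gen8_plus)
    {
      if (is_gen8_plus) {
        memcpy(slot, &gpu_addr, sizeof(uint64_t)); /* full 64-bit address */
      } else {
        uint32_t addr32 = (uint32_t)gpu_addr;      /* truncated 32-bit address */
        memcpy(slot, &addr32, sizeof(uint32_t));
      }
    }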
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index 904f9e0..f575f8b 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -53,6 +53,7 @@ struct intel_gpgpu
uint32_t target_buf_offset[max_buf_n];/* internal offset for buffers binded for the call */
uint32_t binded_offset[max_buf_n]; /* their offsets in the curbe buffer */
uint32_t binded_n; /* number of buffers binded */
+ void *kernel; /* the cl_kernel currently bound to this gpgpu */
unsigned long img_bitmap; /* image usage bitmap. */
unsigned int img_index_base; /* base index for image surface.*/
diff --git a/src/performance.c b/src/performance.c
index 28bd6c6..1e676c3 100644
--- a/src/performance.c
+++ b/src/performance.c
@@ -54,6 +54,8 @@ static context_storage_node * find_context(cl_context context)
if(NULL == record.context_storage)
{
record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+ if (record.context_storage == NULL)
+ return NULL;
record.context_storage->context_id = (uintptr_t)context;
record.context_storage->kernels_storage = NULL;
record.context_storage->kernel_max_time = 0.0f;
@@ -96,6 +98,8 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const
if(NULL == p_context->kernels_storage)
{
p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+ if (p_context->kernels_storage == NULL)
+ return NULL;
p_context->kernel_count++;
strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
@@ -188,6 +192,8 @@ static void print_time_info()
kernel_storage_node *p_kernel = p_context->kernels_storage;
kernel_storage_node *p_tmp_kernel = p_kernel;
time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+ if (te == NULL)
+ return;
memset(te, 0, sizeof(time_element)*p_context->kernel_count);
int i = -1, j = 0, k = 0;
while(NULL != p_tmp_kernel)
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
deleted file mode 100644
index 4a3e89c..0000000
--- a/src/x11/mesa_egl_extension.c
+++ /dev/null
@@ -1,306 +0,0 @@
-#include <stdio.h>
-#include "mesa_egl_extension.h"
-#include "mesa_egl_res_share.h"
-#include "src/cl_driver.h"
-
-struct _egl_display;
-struct _egl_resource;
-struct _egl_thread_info;
-struct _egl_config;
-struct _egl_surface;
-struct _egl_driver;
-
-typedef struct _egl_display _EGLDisplay;
-typedef struct _egl_resource _EGLResource;
-typedef struct _egl_thread_info _EGLThreadInfo;
-typedef struct _egl_config _EGLConfig;
-typedef struct _egl_surface _EGLSurface;
-typedef struct _egl_driver _EGLDriver;
-
-/**
- * A resource of a display.
- */
-struct _egl_resource
-{
- /* which display the resource belongs to */
- _EGLDisplay *Display;
- EGLBoolean IsLinked;
- EGLint RefCount;
-
- /* used to link resources of the same type */
- _EGLResource *Next;
-};
-
-/**
- * "Base" class for device driver contexts.
- */
-struct _egl_context
-{
- /* A context is a display resource */
- _EGLResource Resource;
-
- /* The bound status of the context */
- _EGLThreadInfo *Binding;
- _EGLSurface *DrawSurface;
- _EGLSurface *ReadSurface;
-
- _EGLConfig *Config;
-
- EGLint ClientAPI; /**< EGL_OPENGL_ES_API, EGL_OPENGL_API, EGL_OPENVG_API */
- EGLint ClientMajorVersion;
- EGLint ClientMinorVersion;
- EGLint Flags;
- EGLint Profile;
- EGLint ResetNotificationStrategy;
-
- /* The real render buffer when a window surface is bound */
- EGLint WindowRenderBuffer;
-};
-
-typedef struct _egl_context _EGLContext;
-
-struct dri2_egl_display
-{
- int dri2_major;
- int dri2_minor;
- __DRIscreen *dri_screen;
- int own_dri_screen;
- const __DRIconfig **driver_configs;
- void *driver;
-};
-
-enum _egl_platform_type {
- _EGL_PLATFORM_WINDOWS,
- _EGL_PLATFORM_X11,
- _EGL_PLATFORM_WAYLAND,
- _EGL_PLATFORM_DRM,
- _EGL_PLATFORM_FBDEV,
- _EGL_PLATFORM_NULL,
- _EGL_PLATFORM_ANDROID,
-
- _EGL_NUM_PLATFORMS,
- _EGL_INVALID_PLATFORM = -1
-};
-typedef enum _egl_platform_type _EGLPlatformType;
-
-typedef pthread_mutex_t _EGLMutex;
-
-struct _egl_display
-{
- /* used to link displays */
- _EGLDisplay *Next;
-
- _EGLMutex Mutex;
-
- _EGLPlatformType Platform; /**< The type of the platform display */
- void *PlatformDisplay; /**< A pointer to the platform display */
-
- _EGLDriver *Driver; /**< Matched driver of the display */
- EGLBoolean Initialized; /**< True if the display is initialized */
-
- /* options that affect how the driver initializes the display */
- struct {
- EGLBoolean TestOnly; /**< Driver should not set fields when true */
- EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
- } Options;
-
- /* these fields are set by the driver during init */
- void *DriverData; /**< Driver private data */
-};
-
-static struct dri2_egl_display *
-dri2_egl_display(_EGLDisplay *dpy)
-{
- return (struct dri2_egl_display *)dpy->DriverData;
-}
-
-static _EGLDisplay *
-_eglLockDisplay(EGLDisplay dpy)
-{
- return (_EGLDisplay *)dpy;
-}
-
-static _EGLContext *
-_eglLookupContext(EGLContext ctx, EGLDisplay disp)
-{
- return (_EGLContext *) ctx;
-}
-
-struct dri2_egl_context
-{
- _EGLContext base;
- __DRIcontext *dri_context;
-};
-
-static struct dri2_egl_context *
-dri2_egl_context(_EGLContext *ctx)
-{
- return (struct dri2_egl_context *)ctx;
-}
-
-static EGLBoolean
-dri2_acquire_texture(_EGLDisplay *disp,
- _EGLContext *ctx,
- const EGLint *attr_list,
- void *user_data)
-{
- struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
- struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- GLuint texture = 0;
- GLenum gl_target = 0;
- GLint level = 0;
- GLboolean ret;
-
- if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
- return EGL_FALSE;
-
- ret = cl_gl_acquire_texture(dri2_dpy->driver,
- dri2_ctx->dri_context,
- gl_target, level, texture,
- user_data);
- return ret;
-}
-
-static EGLBoolean
-dri2_release_texture(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
-{
- struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
- struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- GLuint texture = 0;
- GLenum gl_target = 0;
- GLint level = 0;
- GLboolean ret;
-
- if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
- return EGL_FALSE;
-
- ret = cl_gl_release_texture(dri2_dpy->driver, dri2_ctx->dri_context,
- gl_target, level, texture);
- return ret;
-}
-
-static EGLBoolean
-dri2_acquire_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list,
- void *user_data)
-{
- struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
- struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- GLuint bufobj = 0;
- GLboolean ret;
-
- if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
- return EGL_FALSE;
-
- ret = cl_gl_acquire_buffer_object(dri2_dpy->driver,
- dri2_ctx->dri_context,
- bufobj, user_data);
- return ret;
-}
-
-static EGLBoolean
-dri2_release_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
-{
- struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
- struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- GLuint bufobj = 0;
- GLboolean ret;
-
- if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
- return EGL_FALSE;
-
- ret = cl_gl_release_buffer_object(dri2_dpy->driver,
- dri2_ctx->dri_context,
- bufobj);
- return ret;
-}
-
-static EGLBoolean
-dri2_acquire_render_buffer(_EGLDisplay *disp,
- _EGLContext *ctx,
- const EGLint *attr_list,
- void *user_data)
-{
- struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
- struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- GLuint rb = 0;
- GLboolean ret;
-
- if (_eglParseBufferObjAttribList(&rb, attr_list) != EGL_SUCCESS)
- return EGL_FALSE;
-
- ret = cl_gl_acquire_render_buffer(dri2_dpy->driver,
- dri2_ctx->dri_context,
- rb, user_data);
- return ret;
-}
-
-static EGLBoolean
-dri2_release_render_buffer(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
-{
- struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
- struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
- GLuint rb = 0;
- GLboolean ret;
-
- if (_eglParseBufferObjAttribList(&rb, attr_list) != EGL_SUCCESS)
- return EGL_FALSE;
-
- ret = cl_gl_release_render_buffer(dri2_dpy->driver,
- dri2_ctx->dri_context,
- rb);
- return ret;
-}
-
-static EGLBoolean
-dri2_acquire_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
- const EGLint *attrib_list, void *user_data)
-{
- switch (target) {
- case EGL_GL_TEXTURE_MESA:
- return dri2_acquire_texture(disp, ctx, attrib_list, user_data);
- case EGL_GL_BUFFER_OBJECT_MESA:
- return dri2_acquire_buffer_object(disp, ctx, attrib_list, user_data);
- case EGL_GL_RENDER_BUFFER_MESA:
- return dri2_acquire_render_buffer(disp, ctx, attrib_list, user_data);
- default:
- fprintf(stderr, "bad resource target value 0x%04x",
- target);
- }
- return EGL_FALSE;
-}
-
-static EGLBoolean
-dri2_release_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
- const EGLint *attrib_list)
-{
- switch (target) {
- case EGL_GL_TEXTURE_MESA:
- return dri2_release_texture(disp, ctx, attrib_list);
- case EGL_GL_BUFFER_OBJECT_MESA:
- return dri2_release_buffer_object(disp, ctx, attrib_list);
- case EGL_GL_RENDER_BUFFER_MESA:
- return dri2_release_render_buffer(disp, ctx, attrib_list);
- default:
- fprintf(stderr, "bad resource target value 0x%04x",
- target);
- }
- return EGL_FALSE;
-}
-
-EGLBoolean
-eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void *user)
-{
- _EGLDisplay *disp = _eglLockDisplay(dpy);
- _EGLContext *context = _eglLookupContext(ctx, disp);
-
- return dri2_acquire_resource_mesa(disp, context, target, attrib_list, user);
-}
-
-EGLBoolean
-eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list)
-{
- _EGLDisplay *disp = _eglLockDisplay(dpy);
- _EGLContext *context = _eglLookupContext(ctx, disp);
-
- return dri2_release_resource_mesa(disp, context, target, attrib_list);
-}
diff --git a/src/x11/mesa_egl_extension.h b/src/x11/mesa_egl_extension.h
deleted file mode 100644
index 39ea134..0000000
--- a/src/x11/mesa_egl_extension.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __MESA_EGL_EXTENSION_H__
-#define __MESA_EGL_EXTENSION_H__
-
-#include <EGL/egl.h>
-#include <GL/gl.h>
-#include <GL/internal/dri_interface.h>
-
-#define EGL_GL_TEXTURE_MESA 0x3300 /* eglAcuireResource target */
-#define EGL_GL_BUFFER_OBJECT_MESA 0x3301 /* eglAcuireResource target */
-#define EGL_GL_RENDER_BUFFER_MESA 0x3302 /* eglAcuireResource target */
-#define EGL_GL_TEXTURE_ID_MESA 0x3303 /* eglAcuireResource attribute */
-#define EGL_GL_TEXTURE_LEVEL_MESA 0x3304 /* eglAcuireResource attribute */
-#define EGL_GL_TEXTURE_TARGET_MESA 0x3305 /* eglAcuireResource attribute */
-#define EGL_GL_BUFFER_OBJECT_ID_MESA 0x3306 /* eglAcuireResource attribute */
-#define EGL_GL_RENDER_BUFFER_ID_MESA 0x3307 /* eglAcuireResource attribute */
-
-EGLBoolean eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void * user_data);
-EGLBoolean eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list);
-
-#endif
diff --git a/src/x11/mesa_egl_res_share.c b/src/x11/mesa_egl_res_share.c
deleted file mode 100644
index 93e9454..0000000
--- a/src/x11/mesa_egl_res_share.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
- * Copyright 2013-2014 Intel, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include <assert.h>
-#include <string.h>
-
-#include "mesa_egl_extension.h"
-#include "mesa_egl_res_share.h"
-
-/**
- * Parse the list of share texture attributes and return the proper error code.
- */
-EGLint
-_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target, EGLint *level,
- const EGLint *attrib_list)
-{
- EGLint i, err = EGL_SUCCESS;
-
- *texture = 0;
- *gl_target = 0;
- *level = 0;
-
- if (!attrib_list)
- return EGL_BAD_ATTRIBUTE;
-
- for (i = 0; attrib_list[i] != EGL_NONE; i++) {
- EGLint attr = attrib_list[i++];
- EGLint val = attrib_list[i];
-
- switch (attr) {
- case EGL_GL_TEXTURE_LEVEL_MESA:
- *level = val;
- break;
- case EGL_GL_TEXTURE_ID_MESA:
- *texture = val;
- break;
- case EGL_GL_TEXTURE_TARGET_MESA:
- *gl_target = val;
- break;
- default:
- /* unknown attrs are ignored */
- break;
- }
- }
-
- return err;
-}
-
-/**
- * Parse the list of share texture attributes and return the proper error code.
- */
-EGLint
-_eglParseBufferObjAttribList(unsigned int *bufobj, const EGLint *attrib_list)
-{
- EGLint i, err = EGL_SUCCESS;
- *bufobj = 0;
-
- if (!attrib_list)
- return EGL_BAD_ATTRIBUTE;
-
- for (i = 0; attrib_list[i] != EGL_NONE; i++) {
- EGLint attr = attrib_list[i++];
- EGLint val = attrib_list[i];
-
- switch (attr) {
- case EGL_GL_BUFFER_OBJECT_ID_MESA:
- *bufobj = val;
- break;
- default:
- /* unknown attrs are ignored */
- break;
- }
- }
- if (*bufobj == 0)
- err = EGL_BAD_ATTRIBUTE;
-
- return err;
-}
-
-/**
- * Parse the list of share texture attributes and return the proper error code.
- */
-EGLint
-_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list)
-{
- EGLint i, err = EGL_SUCCESS;
- *rb = 0;
-
- if (!attrib_list)
- return EGL_BAD_ATTRIBUTE;
-
- for (i = 0; attrib_list[i] != EGL_NONE; i++) {
- EGLint attr = attrib_list[i++];
- EGLint val = attrib_list[i];
-
- switch (attr) {
- case EGL_GL_RENDER_BUFFER_ID_MESA:
- *rb = val;
- break;
- default:
- /* unknown attrs are ignored */
- break;
- }
- }
- if (*rb == 0)
- err = EGL_BAD_ATTRIBUTE;
-
- return err;
-}
diff --git a/src/x11/mesa_egl_res_share.h b/src/x11/mesa_egl_res_share.h
deleted file mode 100644
index 43e746e..0000000
--- a/src/x11/mesa_egl_res_share.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
- * Copyright 2013-2014 Intel, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef EGLRESSHARE_INCLUDED
-#define EGLRESSHARE_INCLUDED
-
-#include <EGL/egl.h>
-
-EGLint
-_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target,
- EGLint *level, const EGLint *attrib_list);
-EGLint
-_eglParseBufferObjAttribList(unsigned int *bufobj,
- const EGLint *attrib_list);
-
-EGLint
-_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list);
-#endif
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 4957b7c..db61844 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -40,7 +40,10 @@ endif (NOT NOT_BUILD_STAND_ALONE_UTEST)
###################################################################################
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
- ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include
+ ${OPENGL_INCLUDE_DIRS}
+ ${EGL_INCLUDE_DIRS})
+
##### Math Function Part:
EXECUTE_PROCESS(COMMAND mkdir generated -p WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -58,7 +61,7 @@ configure_file (
)
#XXX only need GL if required
-link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${DRM_LIBDIR})
+link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${EGL_LIBDIR} ${X11_LIBDIR} ${DRM_LIBDIR})
set (utests_basic_sources
utest_error.c
@@ -159,6 +162,7 @@ set (utests_sources
compiler_switch.cpp
compiler_bswap.cpp
compiler_clz.cpp
+ compiler_ctz.cpp
compiler_math.cpp
compiler_atomic_functions.cpp
compiler_async_copy.cpp
@@ -281,6 +285,7 @@ set (utests_sources
compiler_sub_group_shuffle_xor.cpp
builtin_global_linear_id.cpp
builtin_local_linear_id.cpp
+ multi_queue_events.cpp
compiler_mix.cpp
compiler_math_3op.cpp
compiler_bsort.cpp
@@ -292,6 +297,19 @@ if (LLVM_VERSION_NODOT VERSION_GREATER 34)
compiler_overflow.cpp)
endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
+if (ENABLE_OPENCL_20)
+ SET(utests_sources
+ ${utests_sources}
+ compiler_program_global.cpp
+ compiler_generic_atomic.cpp
+ compiler_atomic_functions_20.cpp
+ compiler_sampler.cpp
+ compiler_generic_pointer.cpp
+ runtime_pipe_query.cpp
+ compiler_pipe_builtin.cpp
+ compiler_device_enqueue.cpp)
+endif (ENABLE_OPENCL_20)
+
if (NOT_BUILD_STAND_ALONE_UTEST)
if (X11_FOUND)
SET(utests_sources
@@ -339,13 +357,12 @@ add_custom_target(utest_generator
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
-if (EGL_FOUND AND MESA_SOURCE_FOUND)
+# The compiler_fill_gl_image test case also needs Xlib
+if (OPENGL_FOUND AND EGL_FOUND AND X11_FOUND)
SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
- SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
- SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
- SET(UTESTS_REQUIRED_EGL_LIB ${EGL_LIBRARIES})
-else()
- SET(UTESTS_REQUIRED_EGL_LIB "")
+ SET(CMAKE_CXX_FLAGS "-DHAS_GL_EGL_X11 ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+ SET(CMAKE_C_FLAGS "-DHAS_GL_EGL_X11 ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+ SET(UTESTS_REQUIRED_GL_EGL_X11_LIB ${OPENGL_LIBRARIES} ${EGL_LIBRARIES} ${X11_LIBRARIES})
endif()
if (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
@@ -357,13 +374,15 @@ endif ()
if (COMPILER STREQUAL "CLANG")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-tautological-compare")
endif ()
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
+SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-declarations" )
ADD_LIBRARY(utests SHARED ${utests_sources})
if (NOT_BUILD_STAND_ALONE_UTEST)
- TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
+ TARGET_LINK_LIBRARIES(utests cl m ${UTESTS_REQUIRED_GL_EGL_X11_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
else()
- TARGET_LINK_LIBRARIES(utests ${OPENCL_LIBRARIES} m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
+ TARGET_LINK_LIBRARIES(utests ${OPENCL_LIBRARIES} m ${UTESTS_REQUIRED_GL_EGL_X11_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
endif()
ADD_EXECUTABLE(utest_run utest_run.cpp)
diff --git a/utests/compiler_atomic_functions_20.cpp b/utests/compiler_atomic_functions_20.cpp
new file mode 100644
index 0000000..ea1ace5
--- /dev/null
+++ b/utests/compiler_atomic_functions_20.cpp
@@ -0,0 +1,106 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+#include <string.h>
+
+#define GROUP_NUM 16
+#define LOCAL_SIZE 256
+static void cpu_compiler_atomic(int *dst, int *src)
+{
+ dst[4] = 0xffffffff;
+ int tmp[16] = { 0 };
+ tmp[4] = -1;
+ for(int j=0; j<LOCAL_SIZE; j++) {
+ int i = j % 12;
+
+ switch(i) {
+ case 0: tmp[i] += 1; break;
+ case 1: tmp[i] -= 1; break;
+ case 2: tmp[i] += src[j]; break;
+ case 3: tmp[i] -= src[j]; break;
+ case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+ case 5: tmp[i] |= src[j]<<(j>>4); break;
+ case 6: tmp[i] ^= src[j]; break;
+ case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
+ case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
+ case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
+ case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
+ case 11: tmp[i] = src[10]; break;
+ default: break;
+ }
+ }
+
+ for(int k=0; k<GROUP_NUM; k++) {
+ for(int j=0; j<LOCAL_SIZE; j++) {
+ int i = j % 12;
+
+ switch(i) {
+ case 0: dst[i] += 1; break;
+ case 1: dst[i] -= 1; break;
+ case 2: dst[i] += src[j]; break;
+ case 3: dst[i] -= src[j]; break;
+ case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+ case 5: dst[i] |= src[j]<<(j>>4); break;
+ case 6: dst[i] ^= src[j]; break;
+ case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
+ case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
+ case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
+ case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
+ case 11: dst[i] = src[10]; break;
+ default: break;
+ }
+ }
+ }
+
+ for(int i=0; i<12; i++)
+ dst[i+12] = tmp[i];
+}
+
+static void compiler_atomic_functions(const char* kernel_name)
+{
+ const size_t n = GROUP_NUM * LOCAL_SIZE;
+ int cpu_dst[24] = {0}, cpu_src[256];
+
+ globals[0] = n;
+ locals[0] = LOCAL_SIZE;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_atomic_functions_20", kernel_name);
+ OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, 16 * sizeof(int), NULL);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+
+ OCL_MAP_BUFFER(0);
+ memset(buf_data[0], 0, 24 * sizeof(int));
+ ((int *)buf_data[0])[4] = -1;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < locals[0]; ++i)
+ cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
+ cpu_compiler_atomic(cpu_dst, cpu_src);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for(int i=0; i<24; i++) {
+ //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
+ OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+#define compiler_atomic(kernel, version) \
+static void compiler_atomic_functions_##version()\
+{\
+ compiler_atomic_functions(kernel); \
+} \
+MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions_##version)
+
+compiler_atomic("compiler_atomic_functions_20", 20)
+
+
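cpu_compiler_atomic above is the serial reference for the OpenCL 2.0 atomic built-ins that the kernel side (not shown in this patch) applies case by case: add, sub, and, or, xor, signed and unsigned min/max, and exchange. Cases 9 and 10 differ from cases 7 and 8 only in performing the comparison as unsigned, e.g.:

    #include <assert.h>

    int main(void)
    {
      int cur = -1; /* 0xffffffff when viewed as unsigned */
      int src = 5;
      int smin = cur < src ? cur : src;                     /* signed: -1 */
      int umin = (unsigned)cur < (unsigned)src ? cur : src; /* unsigned: 5 */
      assert(smin == -1 && umin == 5);
      return 0;
    }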
diff --git a/utests/compiler_ctz.cpp b/utests/compiler_ctz.cpp
new file mode 100644
index 0000000..d84fdad
--- /dev/null
+++ b/utests/compiler_ctz.cpp
@@ -0,0 +1,62 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+template<typename T>
+T get_max();
+
+template<typename U>
+void test(const char *kernel_name)
+{
+ const size_t n = 65;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_ctz", kernel_name);
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(U), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(U), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i) {
+ ((U*)buf_data[0])[i] = (i < 64) ? (U)(1ull << i) : 0; /* avoid shifting by 64 (undefined) */
+ if(i == sizeof(U)*8)
+ ((U*)buf_data[0])[i] = 0;
+ }
+
+ OCL_UNMAP_BUFFER(0);
+
+ globals[0] = n;
+ locals[0] = 1;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i) {
+ if(sizeof(U) == 1 && i <= 8 )
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+ else if(sizeof(U) == 2 && i <= 16 )
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+ else if(sizeof(U) == 4 && i <= 32 )
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+ else if(sizeof(U) == 8 && i <= 64 )
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+ }
+ OCL_UNMAP_BUFFER(1);
+
+}
+}
+
+#define compiler_ctz(type, kernel)\
+static void compiler_ctz_ ##type(void)\
+{\
+ test<type>(# kernel);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_ctz_ ## type);
+
+compiler_ctz(uint64_t, compiler_ctz_ulong)
+compiler_ctz(uint32_t, compiler_ctz_uint)
+compiler_ctz(uint16_t, compiler_ctz_ushort)
+compiler_ctz(uint8_t, compiler_ctz_uchar)
+compiler_ctz(int64_t, compiler_ctz_long)
+compiler_ctz(int32_t, compiler_ctz_int)
+compiler_ctz(int16_t, compiler_ctz_short)
+compiler_ctz(int8_t, compiler_ctz_char)
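compiler_ctz feeds 1 << i through the ctz() built-in, so the expected answer is exactly i trailing zero bits; the extra element where i equals the type's bit width exercises ctz(0), which OpenCL 2.0 defines as the operand's width. A host-side illustration using the GCC/Clang builtin (which, unlike OpenCL's ctz, leaves the zero case undefined):

    #include <assert.h>

    int main(void)
    {
      assert(__builtin_ctz(1u) == 0);        /* ...0001 */
      assert(__builtin_ctz(8u) == 3);        /* ...1000 */
      assert(__builtin_ctz(1u << 31) == 31); /* only the top bit set */
      /* __builtin_ctz(0) is undefined here; the kernel's ctz(0) must
         return 32 for a uint per the OpenCL 2.0 specification */
      return 0;
    }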
diff --git a/utests/compiler_device_enqueue.cpp b/utests/compiler_device_enqueue.cpp
new file mode 100644
index 0000000..a9e3e2d
--- /dev/null
+++ b/utests/compiler_device_enqueue.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+
+void compiler_device_enqueue(void)
+{
+ const size_t n = 32;
+ const uint32_t global_sz = 3;
+ uint32_t result = 0;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_device_enqueue");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(uint32_t), &global_sz);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+
+ OCL_MAP_BUFFER(0);
+ for(uint32_t i = 0; i < 2 * n; ++i) /* the buffer holds n uint32_t, i.e. 2n shorts */
+ ((short *)buf_data[0])[i] = 0;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ for(uint32_t i = 0; i < global_sz; ++i) {
+ result += i;
+ }
+ result *= global_sz;
+
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == result);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_device_enqueue);
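The expected value in compiler_device_enqueue follows from the host loop above: with global_sz == 3, the child kernels are expected to accumulate sum(0..2) = 3 and scale it by global_sz, so every element should read back as 9. A worked check of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
      unsigned global_sz = 3, result = 0;
      for (unsigned i = 0; i < global_sz; ++i)
        result += i;       /* 0 + 1 + 2 = 3 */
      result *= global_sz; /* 3 * 3 = 9 */
      printf("expected = %u\n", result); /* prints: expected = 9 */
      return 0;
    }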
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
index f1eb8e7..b9d74d1 100644
--- a/utests/compiler_fill_gl_image.cpp
+++ b/utests/compiler_fill_gl_image.cpp
@@ -1,36 +1,14 @@
#include "utest_helper.hpp"
-static void read_back(int tex, int width, int height, uint32_t * resultColor)
-{
- float vertices[8] = {-1, 1, 1, 1, 1, -1, -1, -1};
- float tex_coords[8] = {0, 0, 1, 0, 1, 1, 0, 1};
-
- glBindTexture(GL_TEXTURE_2D, tex);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glEnable(GL_TEXTURE_2D);
- glDisable(GL_BLEND);
- glVertexPointer(2, GL_FLOAT, sizeof(float) * 2, vertices);
- glEnableClientState(GL_VERTEX_ARRAY);
- glClientActiveTexture(GL_TEXTURE0);
- glTexCoordPointer(2, GL_FLOAT, sizeof(float) * 2, tex_coords);
- glEnableClientState(GL_TEXTURE_COORD_ARRAY);
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
- glFlush();
- OCL_SWAP_EGL_BUFFERS();
-
- glReadPixels(0, 0, width, height, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor);
-}
-
static void compiler_fill_gl_image(void)
{
const size_t w = EGL_WINDOW_WIDTH;
const size_t h = EGL_WINDOW_HEIGHT;
- uint32_t color = 0x123456FF;
- uint32_t *resultColor;
+ uint32_t color0 = 0x123456FF;
+ uint32_t color1 = 0x789ABCDE;
+ uint32_t *resultColor0;
+ uint32_t *resultColor1;
GLuint tex;
if (eglContext == EGL_NO_CONTEXT) {
@@ -44,13 +22,15 @@ static void compiler_fill_gl_image(void)
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
+ glGenerateMipmap(GL_TEXTURE_2D);
+ glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, w/2, h/2, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
OCL_CREATE_KERNEL("test_fill_gl_image");
+ //Create cl image from miplevel 0
OCL_CREATE_GL_IMAGE(buf[0], 0, GL_TEXTURE_2D, 0, tex);
-
// Run the kernel
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- OCL_SET_ARG(1, sizeof(color), &color);
+ OCL_SET_ARG(1, sizeof(color0), &color0);
globals[0] = w;
globals[1] = h;
locals[0] = 16;
@@ -59,18 +39,37 @@ static void compiler_fill_gl_image(void)
OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(0);
OCL_NDRANGE(2);
OCL_FLUSH();
+ OCL_ENQUEUE_RELEASE_GL_OBJECTS(0);
// Check result
- resultColor = new uint32_t[w * h * 4];
- if (resultColor == NULL)
+ resultColor0 = new uint32_t[w * h];
+ if (resultColor0 == NULL)
assert(0);
-
- read_back(tex, w, h, resultColor);
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor0);
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
- OCL_ASSERT(resultColor[j * w + i] == color);
- OCL_UNMAP_BUFFER(0);
- delete[] resultColor;
+ OCL_ASSERT(resultColor0[j * w + i] == color0);
+
+
+ //Create cl image from miplevel 1
+ OCL_CREATE_GL_IMAGE(buf[1], 0, GL_TEXTURE_2D, 1, tex);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(1, sizeof(color1), &color1);
+ globals[0] = w/2;
+ globals[1] = h/2;
+ OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(1);
+ OCL_NDRANGE(2);
+ OCL_FLUSH();
+ OCL_ENQUEUE_RELEASE_GL_OBJECTS(1);
+
+ // Check result
+ resultColor1 = new uint32_t[(w/2)*(h/2)];
+ glGetTexImage(GL_TEXTURE_2D, 1, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor1);
+ for (uint32_t j = 0; j < h/2; ++j)
+ for (uint32_t i = 0; i < w/2; i++)
+ OCL_ASSERT(resultColor1[j * (w/2) + i] == color1);
+ delete[] resultColor0;
+ delete[] resultColor1;
}
MAKE_UTEST_FROM_FUNCTION(compiler_fill_gl_image);
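The reworked test above now validates writes to individual mipmap levels through CL/GL sharing and reads results back with glGetTexImage instead of the old draw-and-glReadPixels path. Stripped of the OCL_* test macros, the per-level flow looks roughly like this (standard CL 1.2 GL-sharing entry points; error handling elided and the kernel enqueue left as a placeholder):

    #include <CL/cl.h>
    #include <CL/cl_gl.h>
    #include <GL/gl.h>

    /* fill one GL mip level from an OpenCL kernel, then read it back */
    static void fill_mip_level(cl_context ctx, cl_command_queue queue,
                               cl_kernel kernel, GLuint tex, int level,
                               void *pixels)
    {
      cl_int err;
      cl_mem img = clCreateFromGLTexture(ctx, CL_MEM_WRITE_ONLY,
                                         GL_TEXTURE_2D, level, tex, &err);
      clSetKernelArg(kernel, 0, sizeof(cl_mem), &img);
      clEnqueueAcquireGLObjects(queue, 1, &img, 0, NULL, NULL);
      /* ... clEnqueueNDRangeKernel(queue, kernel, ...) ... */
      clEnqueueReleaseGLObjects(queue, 1, &img, 0, NULL, NULL);
      clFinish(queue);
      glGetTexImage(GL_TEXTURE_2D, level, GL_RGBA,
                    GL_UNSIGNED_INT_8_8_8_8, pixels);
      clReleaseMemObject(img);
    }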
diff --git a/utests/compiler_generic_atomic.cpp b/utests/compiler_generic_atomic.cpp
new file mode 100644
index 0000000..9ed5f53
--- /dev/null
+++ b/utests/compiler_generic_atomic.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+template<typename T>
+void test_atomic(const char* kernelName)
+{
+ const int n = 16;
+ T cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_generic_atomic.cl", kernelName, SOURCE, "-cl-std=CL2.0");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i)
+ cpu_src[i] = ((T*)buf_data[0])[i] = (T)i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < n; ++i) {
+// printf("i=%d dst=%d\n", i, ((T*)buf_data[1])[i]);
+ OCL_ASSERT(((T*)buf_data[1])[i] == 2 * cpu_src[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+#define GENERIC_ATOMIC_TEST(T) \
+void compiler_generic_atomic_##T() { \
+ test_atomic<T>("compiler_generic_atomic_"#T); \
+} \
+MAKE_UTEST_FROM_FUNCTION(compiler_generic_atomic_##T);
+
+GENERIC_ATOMIC_TEST(int)
+//GENERIC_ATOMIC_TEST(long)
+
+
+
diff --git a/utests/compiler_generic_pointer.cpp b/utests/compiler_generic_pointer.cpp
new file mode 100644
index 0000000..5984694
--- /dev/null
+++ b/utests/compiler_generic_pointer.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+
+template<typename T>
+void test(const char* kernelName)
+{
+ const int n = 16;
+ T cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_generic_pointer.cl", kernelName, SOURCE, "-cl-std=CL2.0");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i)
+ cpu_src[i] = ((T*)buf_data[0])[i] = (T)i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < n; ++i) {
+// printf("i=%d dst=%d\n", i, ((T*)buf_data[1])[i]);
+ OCL_ASSERT(((T*)buf_data[1])[i] == 2 * cpu_src[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+#define GENERIC_TEST(T) \
+void compiler_generic_pointer_##T() { \
+ test<T>("compiler_generic_pointer_"#T); \
+} \
+MAKE_UTEST_FROM_FUNCTION(compiler_generic_pointer_##T);
+
+GENERIC_TEST(int)
+GENERIC_TEST(char)
+GENERIC_TEST(short)
+GENERIC_TEST(long)
+
+
diff --git a/utests/compiler_pipe_builtin.cpp b/utests/compiler_pipe_builtin.cpp
new file mode 100644
index 0000000..c8ec077
--- /dev/null
+++ b/utests/compiler_pipe_builtin.cpp
@@ -0,0 +1,69 @@
+#include <string.h>
+#include "utest_helper.hpp"
+typedef struct{
+ int a;
+ uint b;
+}mystruct;
+
+#define PIPE_BUILTIN(TYPE,GROUP) \
+static void compiler_pipe_##GROUP##_##TYPE(void) \
+{ \
+ const size_t w = 16; \
+ uint32_t ans_host = 0; \
+ uint32_t ans_device = 0; \
+ /* pipe write kernel*/ \
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_pipe_builtin", "compiler_pipe_"#GROUP"_write_"#TYPE); \
+ OCL_CALL2(clCreatePipe, buf[0], ctx, 0, sizeof(TYPE), w, NULL);\
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, w * sizeof(TYPE), NULL);\
+ OCL_MAP_BUFFER(1);\
+ for (uint32_t i = 0; i < w; i++)\
+ ((uint32_t*)buf_data[1])[i] = i;\
+ OCL_UNMAP_BUFFER(1);\
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);\
+ globals[0] = w;\
+ locals[0] = 16;\
+ OCL_NDRANGE(1);\
+ OCL_CALL(clReleaseKernel, kernel);\
+ /* pipe read kernel */\
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_pipe_builtin", "compiler_pipe_"#GROUP"_read_"#TYPE);\
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, w * sizeof(TYPE), NULL);\
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[2]);\
+ OCL_NDRANGE(1);\
+ /* Check result */\
+ OCL_MAP_BUFFER(2);\
+ for (uint32_t i = 0; i < w; i++) {\
+ ans_device += ((uint32_t*)buf_data[2])[i];\
+ ans_host += i;\
+ }\
+ OCL_UNMAP_BUFFER(2);\
+ OCL_ASSERT(ans_host == ans_device);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_pipe_##GROUP##_##TYPE);
+
+PIPE_BUILTIN(int, convenience)
+PIPE_BUILTIN(mystruct, convenience)
+PIPE_BUILTIN(int, reserve)
+PIPE_BUILTIN(mystruct, reserve)
+PIPE_BUILTIN(int, workgroup)
+PIPE_BUILTIN(mystruct, workgroup)
+
+static void compiler_pipe_query(void) {
+ const size_t w = 32;
+ const size_t sz = 16;
+ /* pipe write kernel */
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_pipe_builtin", "compiler_pipe_query");
+ OCL_CALL2(clCreatePipe, buf[0], ctx, 0, sizeof(uint32_t), w, NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sz * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = sz;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ /* Check result */
+ OCL_MAP_BUFFER(1);
+ OCL_ASSERT(sz == ((uint32_t *)buf_data[1])[0] && w == ((uint32_t *)buf_data[1])[1]);
+ OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_pipe_query);
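compiler_pipe_query checks the pipe's packet capacity from inside a kernel; the host-side mirror of those queries is clGetPipeInfo (OpenCL 2.0). A minimal sketch:

    #include <CL/cl.h>
    #include <stdio.h>

    /* create a pipe of 32 uint packets and query its properties back */
    static void pipe_info_example(cl_context ctx)
    {
      cl_int err;
      cl_mem pipe = clCreatePipe(ctx, 0, sizeof(cl_uint), 32, NULL, &err);
      cl_uint packet_sz = 0, max_packets = 0;
      clGetPipeInfo(pipe, CL_PIPE_PACKET_SIZE,
                    sizeof(packet_sz), &packet_sz, NULL);
      clGetPipeInfo(pipe, CL_PIPE_MAX_PACKETS,
                    sizeof(max_packets), &max_packets, NULL);
      printf("packet size %u, max packets %u\n", packet_sz, max_packets);
      clReleaseMemObject(pipe);
    }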
diff --git a/utests/compiler_program_global.cpp b/utests/compiler_program_global.cpp
new file mode 100644
index 0000000..ef7c655
--- /dev/null
+++ b/utests/compiler_program_global.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+static int init_program(const char* name, cl_context ctx, cl_program *pg )
+{
+ cl_int err;
+ char* ker_path = cl_do_kiss_path(name, device);
+
+ cl_file_map_t *fm = cl_file_map_new();
+ err = cl_file_map_open(fm, ker_path);
+ if(err != CL_FILE_MAP_SUCCESS)
+ OCL_ASSERT(0);
+ const char *src = cl_file_map_begin(fm);
+
+ *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return 0;
+
+}
+
+void compiler_program_global()
+{
+ const int n = 16;
+ int cpu_src[16];
+ cl_int err;
+
+ // Setup kernel and buffers
+ cl_program program;
+ init_program("compiler_program_global.cl", ctx, &program);
+ OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL, NULL);
+
+ cl_kernel k0 = clCreateKernel(program, "compiler_program_global0", &err);
+ assert(err == CL_SUCCESS);
+ cl_kernel k1 = clCreateKernel(program, "compiler_program_global1", &err);
+ assert(err == CL_SUCCESS);
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+
+ OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]);
+ OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]);
+
+ int dynamic = 1;
+ OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic);
+ OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic);
+
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i)
+ cpu_src[i] = ((int*)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0, NULL, NULL);
+ OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0, NULL, NULL);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < n; ++i) {
+// printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]);
+ switch(i) {
+ default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break;
+ case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break;
+ case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break;
+ case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+ case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break;
+ case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ clReleaseKernel(k0);
+ clReleaseKernel(k1);
+ clReleaseProgram(program);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_program_global);
+
diff --git a/utests/compiler_sampler.cpp b/utests/compiler_sampler.cpp
index 32bf926..f8bf622 100644
--- a/utests/compiler_sampler.cpp
+++ b/utests/compiler_sampler.cpp
@@ -8,7 +8,7 @@ void compiler_sampler(void)
OCL_ASSERT(ctx != 0);
cl_sampler s;
cl_int err;
- int a1[] = {CL_TRUE, CL_FALSE},
+ cl_uint a1[] = {CL_TRUE, CL_FALSE},
a2[] = {CL_ADDRESS_MIRRORED_REPEAT,
CL_ADDRESS_REPEAT,
CL_ADDRESS_CLAMP_TO_EDGE,
@@ -33,6 +33,18 @@ void compiler_sampler(void)
for(l=0; l<5; l++)
OCL_CALL(clGetSamplerInfo, s, a4[l], 1000, pv, &pv_size);
OCL_CALL(clReleaseSampler, s);
+ cl_sampler_properties sam[] = {
+ CL_SAMPLER_NORMALIZED_COORDS, a1[i],
+ CL_SAMPLER_ADDRESSING_MODE, a2[j],
+ CL_SAMPLER_FILTER_MODE, a3[k],
+ 0};
+ s = clCreateSamplerWithProperties(ctx, sam, &err);
+ OCL_ASSERT(err == CL_SUCCESS);
+ OCL_CALL(clRetainSampler, s);
+ OCL_CALL(clReleaseSampler, s);
+ for(l=0; l<5; l++)
+ OCL_CALL(clGetSamplerInfo, s, a4[l], 1000, pv, &pv_size);
+ OCL_CALL(clReleaseSampler, s);
}
}
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle.cpp
index f33e9de..2aadfed 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle.cpp
@@ -1,6 +1,6 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_int(void)
{
if(!cl_check_subgroups())
return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle(void)
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle",
+ "compiler_sub_group_shuffle_int");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -43,5 +44,50 @@ void compiler_sub_group_shuffle(void)
}
OCL_UNMAP_BUFFER(0);
}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_int);
+void compiler_sub_group_shuffle_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ const size_t n = 32;
+ const int32_t buf_size = 4 * n + 1;
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle.cl",
+ "compiler_sub_group_shuffle_short",
+ SOURCE, "-DSHORT");
+ OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ int c = 3;
+ OCL_SET_ARG(1, sizeof(int), &c);
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < buf_size; ++i)
+ ((short*)buf_data[0])[i] = -1;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle);
+ // Compare
+ OCL_MAP_BUFFER(0);
+ short* dst = (short*)buf_data[0];
+ int suggroupsize = dst[0];
+ OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+ dst++;
+ for (int32_t i = 0; i < (int32_t) n; ++i){
+ int round = i / suggroupsize;
+ int index = i % suggroupsize;
+ OCL_ASSERT(index == dst[4*i]);
+ OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
+ OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
+ OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_short);
diff --git a/utests/compiler_sub_group_shuffle_down.cpp b/utests/compiler_sub_group_shuffle_down.cpp
index 8b23234..13f6e12 100644
--- a/utests/compiler_sub_group_shuffle_down.cpp
+++ b/utests/compiler_sub_group_shuffle_down.cpp
@@ -1,6 +1,6 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle_down(void)
+void compiler_sub_group_shuffle_down_int(void)
{
if(!cl_check_subgroups())
return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_down(void)
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle_down");
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_down",
+ "compiler_sub_group_shuffle_down_int");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_down(void)
}
OCL_UNMAP_BUFFER(0);
}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down_int);
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down);
+void compiler_sub_group_shuffle_down_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ const size_t n = 32;
+ const int32_t buf_size = 4 * n + 1;
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_down.cl",
+ "compiler_sub_group_shuffle_down_short",
+ SOURCE, "-DSHORT");
+ OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ int c = 13;
+ OCL_SET_ARG(1, sizeof(int), &c);
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < buf_size; ++i)
+ ((short*)buf_data[0])[i] = -1;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ short* dst = (short *)buf_data[0];
+ short subgroupsize = dst[0];
+ OCL_ASSERT(subgroupsize == 8 || subgroupsize == 16);
+
+ dst++;
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int round = i / subgroupsize;
+ int index = i % subgroupsize;
+ //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+ OCL_ASSERT( (index + c >= subgroupsize ? 456 : 123) == dst[4*i]);
+ OCL_ASSERT( (index + c >= subgroupsize ? (round * subgroupsize + (i + c) % subgroupsize) : 123) == dst[4*i+1]);
+ OCL_ASSERT( (index + index + 1 >= subgroupsize ? -(round * subgroupsize + (i + index + 1) % subgroupsize) : (round * subgroupsize + (i + index + 1) % subgroupsize)) == dst[4*i+2]);
+ OCL_ASSERT((round * subgroupsize + (subgroupsize - 1)) == dst[4*i+3]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down_short);
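
The shuffle_down assertions encode the two-input form intel_sub_group_shuffle_down(current, next, delta): lane i reads current[i + delta] while the shifted index stays inside the subgroup, and spills into next once it overflows, which is the "index + c >= subgroupsize" split above (the constants 123 and 456 stand in for the two operands in the dst[4*i] check). A host-side sketch of that index rule, assuming nothing beyond the spec wording:

    #include <vector>
    #include <cstddef>

    template <typename T>
    T ref_shuffle_down(const std::vector<T> &cur, const std::vector<T> &next,
                       std::size_t delta, std::size_t lane) {
      std::size_t src = lane + delta;
      return src < cur.size() ? cur.at(src)              // still inside the subgroup
                              : next.at(src - cur.size()); // spilled into next
    }
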
diff --git a/utests/compiler_sub_group_shuffle_up.cpp b/utests/compiler_sub_group_shuffle_up.cpp
index d2e054b..f79f03c 100644
--- a/utests/compiler_sub_group_shuffle_up.cpp
+++ b/utests/compiler_sub_group_shuffle_up.cpp
@@ -1,6 +1,6 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle_up(void)
+void compiler_sub_group_shuffle_up_int(void)
{
if(!cl_check_subgroups())
return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_up(void)
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle_up");
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_up",
+ "compiler_sub_group_shuffle_up_int");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_up(void)
}
OCL_UNMAP_BUFFER(0);
}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up_int);
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up);
+void compiler_sub_group_shuffle_up_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ const size_t n = 32;
+ const int32_t buf_size = 4 * n + 1;
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_up.cl",
+ "compiler_sub_group_shuffle_up_short",
+ SOURCE, "-DSHORT");
+ OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ int c = 3;
+ OCL_SET_ARG(1, sizeof(int), &c);
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < buf_size; ++i)
+ ((short*)buf_data[0])[i] = -1;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ short* dst = (short *)buf_data[0];
+ short subgroupsize = dst[0];
+ OCL_ASSERT(subgroupsize == 8 || subgroupsize == 16);
+
+ dst++;
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int round = i / subgroupsize;
+ int index = i % subgroupsize;
+ //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+ OCL_ASSERT( ((c - index) > 0 ? 123 : 456) == dst[4*i]);
+ OCL_ASSERT( ((c - index) > 0 ? 123 : (i - c)) == dst[4*i+1]);
+ OCL_ASSERT( ((subgroupsize - index - 1 - index) > 0 ? (i + index + 1) : -(i + index + 1 - subgroupsize)) == dst[4*i+2]);
+ OCL_ASSERT((round * subgroupsize + (subgroupsize - 1)) == dst[4*i+3]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up_short);
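
shuffle_up is the mirror image: lane i reads current[i - delta] and borrows from the previous operand when the index underflows, which is exactly the "(c - index) > 0" split checked above. A sketch under the same assumptions as before:

    #include <vector>
    #include <cstddef>

    template <typename T>
    T ref_shuffle_up(const std::vector<T> &prev, const std::vector<T> &cur,
                     std::size_t delta, std::size_t lane) {
      long src = static_cast<long>(lane) - static_cast<long>(delta);
      return src < 0 ? prev.at(src + (long)prev.size()) // borrowed from prev
                     : cur.at(src);
    }
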
diff --git a/utests/compiler_sub_group_shuffle_xor.cpp b/utests/compiler_sub_group_shuffle_xor.cpp
index 967ec3e..b0ad3ee 100644
--- a/utests/compiler_sub_group_shuffle_xor.cpp
+++ b/utests/compiler_sub_group_shuffle_xor.cpp
@@ -1,6 +1,6 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle_xor(void)
+void compiler_sub_group_shuffle_xor_int(void)
{
if(!cl_check_subgroups())
return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_xor(void)
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle_xor");
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_xor",
+ "compiler_sub_group_shuffle_xor_int");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_xor(void)
}
OCL_UNMAP_BUFFER(0);
}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor_int);
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor);
+void compiler_sub_group_shuffle_xor_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ const size_t n = 32;
+ const int32_t buf_size = 4 * n + 1;
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_xor.cl",
+ "compiler_sub_group_shuffle_xor_short",
+ SOURCE, "-DSHORT");
+ OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ int c = 3;
+ OCL_SET_ARG(1, sizeof(int), &c);
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < buf_size; ++i)
+ ((short*)buf_data[0])[i] = -1;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ short* dst = (short *)buf_data[0];
+ short subgroupsize = dst[0];
+ OCL_ASSERT(subgroupsize == 8 || subgroupsize == 16);
+
+ dst++;
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int round = i / subgroupsize;
+ int index = i % subgroupsize;
+ OCL_ASSERT(index == dst[4*i]);
+ //printf("%d %d %d %d\n", i, dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+ OCL_ASSERT((round * subgroupsize + (c ^ index)) == dst[4*i+1]);
+ OCL_ASSERT((round * subgroupsize + (index ^ (subgroupsize - index - 1))) == dst[4*i+2]);
+ OCL_ASSERT((round * subgroupsize + (index ^ (index + 1) % subgroupsize)) == dst[4*i+3]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor_short);
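
shuffle_xor sources from lane (local_id ^ mask), so with lane j of round r holding r * size + j the expected values are the round base plus the XOR-ed lane indices asserted above. Sketch:

    #include <vector>
    #include <cstddef>

    template <typename T>
    T ref_shuffle_xor(const std::vector<T> &lanes, std::size_t mask,
                      std::size_t lane) {
      return lanes.at(lane ^ mask); // mask c gives the dst[4*i+1] case above
    }
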
diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp
index 2e3fabb..33ec43c 100644
--- a/utests/compiler_subgroup_broadcast.cpp
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -59,10 +59,15 @@ static void generate_data(T* &input,
/* initially 0, augment after */
input[gid + lid] = 0;
- /* check all data types, test ideal for QWORD types */
- input[gid + lid] += ((rand() % 2 - 1) * base_val);
- /* add trailing random bits, tests GENERAL cases */
- input[gid + lid] += (rand() % 112);
+ if(sizeof(T) == 2) {
+ input[gid + lid] = __float_to_half(as_uint((float)(gid + lid)));
+ }
+ else {
+ /* check all data types, test ideal for QWORD types */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+ }
#if DEBUG_STDOUT
/* output generated input */
@@ -185,3 +190,28 @@ void compiler_subgroup_broadcast_long(void)
subgroup_generic(input, expected);
}
MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_broadcast_long);
+void compiler_subgroup_broadcast_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+ "compiler_subgroup_broadcast_short");
+ subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_short);
+void compiler_subgroup_broadcast_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_broadcast.cl",
+ "compiler_subgroup_broadcast_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_half);
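
The half variant relies on __float_to_half, which (as used above) takes the float's raw bits from as_uint and returns the fp16 bit pattern. As a rough illustration only, here is a truncating converter showing the bit-level work involved; it ignores rounding, NaNs and subnormals, which the real helper in utest_helper handles:

    #include <cstdint>
    #include <cstring>

    static uint16_t float_to_half_trunc(float f) {
      uint32_t u;
      std::memcpy(&u, &f, sizeof(u));                          // raw IEEE 754 bits
      uint16_t sign = (u >> 16) & 0x8000;
      int32_t  exp  = (int32_t)((u >> 23) & 0xff) - 127 + 15;  // rebias 8 -> 5 bits
      uint16_t man  = (u >> 13) & 0x3ff;                       // top 10 mantissa bits
      if (exp <= 0)  return sign;                              // flush tiny values to zero
      if (exp >= 31) return sign | 0x7c00;                     // overflow to infinity
      return sign | (uint16_t)(exp << 10) | man;
    }
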
diff --git a/utests/compiler_subgroup_buffer_block_read.cpp b/utests/compiler_subgroup_buffer_block_read.cpp
index 9707f19..74bc899 100644
--- a/utests/compiler_subgroup_buffer_block_read.cpp
+++ b/utests/compiler_subgroup_buffer_block_read.cpp
@@ -64,6 +64,7 @@ static void generate_data(T* &input,
input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
/* add trailing random bits, tests GENERAL cases */
input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+ //input[(gid + lid)*VEC_SIZE + vsz] = (gid + lid)*VEC_SIZE + vsz;
#if DEBUG_STDOUT
/* output generated input */
@@ -156,47 +157,95 @@ static void subgroup_generic(T* input,
/*
* subgroup buffer block read
*/
-void compiler_subgroup_buffer_block_read1(void)
+void compiler_subgroup_buffer_block_read_ui1(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
- "compiler_subgroup_buffer_block_read1");
+ "compiler_subgroup_buffer_block_read_ui1");
subgroup_generic(input, expected, 1);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read1);
-void compiler_subgroup_buffer_block_read2(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_ui1);
+void compiler_subgroup_buffer_block_read_ui2(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
- "compiler_subgroup_buffer_block_read2");
+ "compiler_subgroup_buffer_block_read_ui2");
subgroup_generic(input, expected, 2);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read2);
-void compiler_subgroup_buffer_block_read4(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_ui2);
+void compiler_subgroup_buffer_block_read_ui4(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
- "compiler_subgroup_buffer_block_read4");
+ "compiler_subgroup_buffer_block_read_ui4");
subgroup_generic(input, expected, 4);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read4);
-void compiler_subgroup_buffer_block_read8(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_ui4);
+void compiler_subgroup_buffer_block_read_ui8(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
- "compiler_subgroup_buffer_block_read8");
+ "compiler_subgroup_buffer_block_read_ui8");
subgroup_generic(input, expected, 8);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read8);
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_ui8);
+void compiler_subgroup_buffer_block_read_us1(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+ "compiler_subgroup_buffer_block_read_us1",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_us1);
+void compiler_subgroup_buffer_block_read_us2(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+ "compiler_subgroup_buffer_block_read_us2",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_us2);
+void compiler_subgroup_buffer_block_read_us4(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+ "compiler_subgroup_buffer_block_read_us4",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_us4);
+void compiler_subgroup_buffer_block_read_us8(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+ "compiler_subgroup_buffer_block_read_us8",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read_us8);
diff --git a/utests/compiler_subgroup_buffer_block_write.cpp b/utests/compiler_subgroup_buffer_block_write.cpp
index 6b257c5..3b538da 100644
--- a/utests/compiler_subgroup_buffer_block_write.cpp
+++ b/utests/compiler_subgroup_buffer_block_write.cpp
@@ -64,6 +64,7 @@ static void generate_data(T* &input,
input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
/* add trailing random bits, tests GENERAL cases */
input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+ //input[(gid + lid)*VEC_SIZE + vsz] = (gid + lid)*VEC_SIZE + vsz;
#if DEBUG_STDOUT
/* output generated input */
@@ -156,47 +157,96 @@ static void subgroup_generic(T* input,
/*
* subgroup buffer block write
*/
-void compiler_subgroup_buffer_block_write1(void)
+void compiler_subgroup_buffer_block_write_ui1(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
- "compiler_subgroup_buffer_block_write1");
+ "compiler_subgroup_buffer_block_write_ui1");
subgroup_generic(input, expected, 1);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write1);
-void compiler_subgroup_buffer_block_write2(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_ui1);
+void compiler_subgroup_buffer_block_write_ui2(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
- "compiler_subgroup_buffer_block_write2");
+ "compiler_subgroup_buffer_block_write_ui2");
subgroup_generic(input, expected, 2);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write2);
-void compiler_subgroup_buffer_block_write4(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_ui2);
+void compiler_subgroup_buffer_block_write_ui4(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
- "compiler_subgroup_buffer_block_write4");
+ "compiler_subgroup_buffer_block_write_ui4");
subgroup_generic(input, expected, 4);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write4);
-void compiler_subgroup_buffer_block_write8(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_ui4);
+void compiler_subgroup_buffer_block_write_ui8(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
- "compiler_subgroup_buffer_block_write8");
+ "compiler_subgroup_buffer_block_write_ui8");
subgroup_generic(input, expected, 8);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write8);
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_ui8);
+
+void compiler_subgroup_buffer_block_write_us1(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+ "compiler_subgroup_buffer_block_write_us1",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_us1);
+void compiler_subgroup_buffer_block_write_us2(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+ "compiler_subgroup_buffer_block_write_us2",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_us2);
+void compiler_subgroup_buffer_block_write_us4(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+ "compiler_subgroup_buffer_block_write_us4",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_us4);
+void compiler_subgroup_buffer_block_write_us8(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+ "compiler_subgroup_buffer_block_write_us8",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write_us8);
diff --git a/utests/compiler_subgroup_image_block_read.cpp b/utests/compiler_subgroup_image_block_read.cpp
index 02c8f07..879d622 100644
--- a/utests/compiler_subgroup_image_block_read.cpp
+++ b/utests/compiler_subgroup_image_block_read.cpp
@@ -21,7 +21,7 @@ static void compute_expected(T* input,
{
for(uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
for(uint32_t j = 0; j < VEC_SIZE; j++)
- expected[i * VEC_SIZE + j] = input[WG_GLOBAL_SIZE * j + i];
+ expected[i * VEC_SIZE + j] = input[WG_GLOBAL_SIZE * 4 / sizeof(T) * j + i];
}
/*
@@ -33,7 +33,8 @@ static void generate_data(T* &input,
size_t VEC_SIZE)
{
/* allocate input and expected arrays */
- input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+ int* input_ui = new int[WG_GLOBAL_SIZE * VEC_SIZE];
+ input = (T*)input_ui;
expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
/* base value for all data types */
@@ -45,19 +46,22 @@ static void generate_data(T* &input,
#if DEBUG_STDOUT
cout << endl << "IN: " << endl;
#endif
+ uint32_t rpitch = sizeof(uint32_t) * WG_GLOBAL_SIZE / sizeof(T);
/* generate inputs and expected values */
- for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE * VEC_SIZE; gid++)
- {
- /* initially 0, augment after */
- input[gid] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+ for(uint32_t h = 0; h < VEC_SIZE; ++h) {
+ for(uint32_t w = 0; w < WG_GLOBAL_SIZE; ++w)
+ {
+ /* initially 0, augment after */
+ input[w + h * rpitch] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+ //input[w + h * rpitch] = w + h * WG_GLOBAL_SIZE;
#if DEBUG_STDOUT
- /* output generated input */
- cout << setw(4) << input[gid] << ", " ;
- if((gid + 1) % 8 == 0)
- cout << endl;
+ /* output generated input */
+ cout << setw(4) << input[w + h * rpitch] << ", " ;
+ if((w + 1) % 8 == 0)
+ cout << endl;
#endif
-
+ }
}
/* expected values */
compute_expected(input, expected, VEC_SIZE);
@@ -151,47 +155,95 @@ static void subgroup_generic(T* input,
/*
* sub_group image block read functions
*/
-void compiler_subgroup_image_block_read1(void)
+void compiler_subgroup_image_block_read_ui1(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
- "compiler_subgroup_image_block_read1");
+ "compiler_subgroup_image_block_read_ui1");
subgroup_generic(input, expected, 1);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read1);
-void compiler_subgroup_image_block_read2(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_ui1);
+void compiler_subgroup_image_block_read_ui2(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
- "compiler_subgroup_image_block_read2");
+ "compiler_subgroup_image_block_read_ui2");
subgroup_generic(input, expected, 2);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read2);
-void compiler_subgroup_image_block_read4(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_ui2);
+void compiler_subgroup_image_block_read_ui4(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
- "compiler_subgroup_image_block_read4");
+ "compiler_subgroup_image_block_read_ui4");
subgroup_generic(input, expected, 4);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read4);
-void compiler_subgroup_image_block_read8(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_ui4);
+void compiler_subgroup_image_block_read_ui8(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
- "compiler_subgroup_image_block_read8");
+ "compiler_subgroup_image_block_read_ui8");
+ subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_ui8);
+void compiler_subgroup_image_block_read_us1(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+ "compiler_subgroup_image_block_read_us1",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_us1);
+void compiler_subgroup_image_block_read_us2(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+ "compiler_subgroup_image_block_read_us2",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_us2);
+void compiler_subgroup_image_block_read_us4(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+ "compiler_subgroup_image_block_read_us4",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_us4);
+void compiler_subgroup_image_block_read_us8(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+ "compiler_subgroup_image_block_read_us8",
+ SOURCE, "-DSHORT");
subgroup_generic(input, expected, 8);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read8);
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read_us8);
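
The rpitch arithmetic above encodes one fact: the test image row is always allocated as WG_GLOBAL_SIZE 32-bit pixels, i.e. WG_GLOBAL_SIZE * 4 bytes, so a row holds WG_GLOBAL_SIZE * 4 / sizeof(T) elements of T -- twice as many for the new ushort variants as for uint. A sketch of the resulting index mapping:

    #include <cstdint>
    #include <cstddef>

    // Element index of (row h, column w) when rows are sized in 32-bit pixels.
    template <typename T>
    std::size_t image_index(std::size_t h, std::size_t w, std::size_t row_pixels) {
      std::size_t rpitch = row_pixels * sizeof(uint32_t) / sizeof(T);
      return h * rpitch + w;   // matches input[w + h * rpitch] above
    }
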
diff --git a/utests/compiler_subgroup_image_block_write.cpp b/utests/compiler_subgroup_image_block_write.cpp
index 2b85167..98cbb0f 100644
--- a/utests/compiler_subgroup_image_block_write.cpp
+++ b/utests/compiler_subgroup_image_block_write.cpp
@@ -50,6 +50,7 @@ static void generate_data(T* &input,
{
/* initially 0, augment after */
input[gid] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+ //input[gid] = gid;
#if DEBUG_STDOUT
/* output generated input */
@@ -155,47 +156,95 @@ static void subgroup_generic(T* input,
/*
* sub_group image block write functions
*/
-void compiler_subgroup_image_block_write1(void)
+void compiler_subgroup_image_block_write_ui1(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
- "compiler_subgroup_image_block_write1");
+ "compiler_subgroup_image_block_write_ui1");
subgroup_generic(input, expected, 1);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write1);
-void compiler_subgroup_image_block_write2(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_ui1);
+void compiler_subgroup_image_block_write_ui2(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
- "compiler_subgroup_image_block_write2");
+ "compiler_subgroup_image_block_write_ui2");
subgroup_generic(input, expected, 2);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write2);
-void compiler_subgroup_image_block_write4(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_ui2);
+void compiler_subgroup_image_block_write_ui4(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
- "compiler_subgroup_image_block_write4");
+ "compiler_subgroup_image_block_write_ui4");
subgroup_generic(input, expected, 4);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write4);
-void compiler_subgroup_image_block_write8(void)
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_ui4);
+void compiler_subgroup_image_block_write_ui8(void)
{
if(!cl_check_subgroups())
return;
cl_uint *input = NULL;
cl_uint *expected = NULL;
OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
- "compiler_subgroup_image_block_write8");
+ "compiler_subgroup_image_block_write_ui8");
subgroup_generic(input, expected, 8);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write8);
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_ui8);
+void compiler_subgroup_image_block_write_us1(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+ "compiler_subgroup_image_block_write_us1",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_us1);
+void compiler_subgroup_image_block_write_us2(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+ "compiler_subgroup_image_block_write_us2",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_us2);
+void compiler_subgroup_image_block_write_us4(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+ "compiler_subgroup_image_block_write_us4",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_us4);
+void compiler_subgroup_image_block_write_us8(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+ "compiler_subgroup_image_block_write_us8",
+ SOURCE, "-DSHORT");
+ subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write_us8);
diff --git a/utests/compiler_subgroup_reduce.cpp b/utests/compiler_subgroup_reduce.cpp
index 3c3df06..157086a 100644
--- a/utests/compiler_subgroup_reduce.cpp
+++ b/utests/compiler_subgroup_reduce.cpp
@@ -33,7 +33,8 @@ template<class T>
static void compute_expected(WG_FUNCTION wg_func,
T* input,
T* expected,
- size_t SIMD_SIZE)
+ size_t SIMD_SIZE,
+ bool IS_HALF)
{
if(wg_func == WG_ANY)
{
@@ -54,24 +55,43 @@ static void compute_expected(WG_FUNCTION wg_func,
else if(wg_func == WG_REDUCE_ADD)
{
T wg_sum = input[0];
- for(uint32_t i = 1; i < SIMD_SIZE; i++)
- wg_sum += input[i];
+ if(IS_HALF) {
+ float wg_sum_tmp = 0.0f;
+ for(uint32_t i = 0; i < SIMD_SIZE; i++) {
+ wg_sum_tmp += as_float(__half_to_float(input[i]));
+ }
+ wg_sum = __float_to_half(as_uint(wg_sum_tmp));
+ }
+ else {
+ for(uint32_t i = 1; i < SIMD_SIZE; i++)
+ wg_sum += input[i];
+ }
for(uint32_t i = 0; i < SIMD_SIZE; i++)
expected[i] = wg_sum;
}
else if(wg_func == WG_REDUCE_MAX)
{
T wg_max = input[0];
- for(uint32_t i = 1; i < SIMD_SIZE; i++)
- wg_max = max(input[i], wg_max);
+ for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+ if (IS_HALF) {
+ wg_max = (as_float(__half_to_float(input[i])) > as_float(__half_to_float(wg_max))) ? input[i] : wg_max;
+ }
+ else
+ wg_max = max(input[i], wg_max);
+ }
for(uint32_t i = 0; i < SIMD_SIZE; i++)
expected[i] = wg_max;
}
else if(wg_func == WG_REDUCE_MIN)
{
T wg_min = input[0];
- for(uint32_t i = 1; i < SIMD_SIZE; i++)
- wg_min = min(input[i], wg_min);
+ for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+ if (IS_HALF) {
+ wg_min = (as_float(__half_to_float(input[i])) < as_float(__half_to_float(wg_min))) ? input[i] : wg_min;
+ }
+ else
+ wg_min = min(input[i], wg_min);
+ }
for(uint32_t i = 0; i < SIMD_SIZE; i++)
expected[i] = wg_min;
}
@@ -85,7 +105,8 @@ template<class T>
static void generate_data(WG_FUNCTION wg_func,
T* &input,
T* &expected,
- size_t SIMD_SIZE)
+ size_t SIMD_SIZE,
+ bool IS_HALF)
{
input = new T[WG_GLOBAL_SIZE];
expected = new T[WG_GLOBAL_SIZE];
@@ -115,6 +136,8 @@ static void generate_data(WG_FUNCTION wg_func,
/* add trailing random bits, tests GENERAL cases */
input[gid + lid] += (rand() % 112);
/* always last bit is 1, ideal test ALL/ANY */
+ if (IS_HALF)
+ input[gid + lid] = __float_to_half(as_uint((float)input[gid + lid]/2));
} else {
input[gid + lid] += rand();
input[gid + lid] += rand() / ((float)RAND_MAX + 1);
@@ -129,7 +152,7 @@ static void generate_data(WG_FUNCTION wg_func,
}
/* expected values */
- compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+ compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE, IS_HALF);
#if DEBUG_STDOUT
/* output expected input */
@@ -152,7 +175,8 @@ static void generate_data(WG_FUNCTION wg_func,
template<class T>
static void subgroup_generic(WG_FUNCTION wg_func,
T* input,
- T* expected)
+ T* expected,
+ bool IS_HALF = false)
{
/* get simd size */
globals[0] = WG_GLOBAL_SIZE;
@@ -161,7 +185,7 @@ static void subgroup_generic(WG_FUNCTION wg_func,
OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
/* input and expected data */
- generate_data(wg_func, input, expected, SIMD_SIZE);
+ generate_data(wg_func, input, expected, SIMD_SIZE, IS_HALF);
/* prepare input for data type */
OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
@@ -185,8 +209,22 @@ static void subgroup_generic(WG_FUNCTION wg_func,
for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
if(((T *)buf_data[1])[i] != *(expected + i))
{
+ if (IS_HALF) {
+ float num_computed = as_float(__half_to_float(((T *)buf_data[1])[i]));
+ float num_expected = as_float(__half_to_float(*(expected + i)));
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if (num_diff > 0.03f) {
+ mismatches++;
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " << num_computed
+ << " != " << num_expected << " diff: " << num_diff << endl;
+#endif
+ }
+ }
/* found mismatch on integer, increment */
- if (numeric_limits<T>::is_integer) {
+ else if (numeric_limits<T>::is_integer) {
mismatches++;
#if DEBUG_STDOUT
@@ -305,6 +343,42 @@ void compiler_subgroup_reduce_add_float(void)
subgroup_generic(WG_REDUCE_ADD, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_float);
+void compiler_subgroup_reduce_add_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_reduce.cl",
+ "compiler_subgroup_reduce_add_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_REDUCE_ADD, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_half);
+void compiler_subgroup_reduce_add_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+ "compiler_subgroup_reduce_add_short");
+ subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_short);
+void compiler_subgroup_reduce_add_ushort(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+ "compiler_subgroup_reduce_add_ushort");
+ subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_ushort);
/*
* Workgroup reduce max utest functions
@@ -364,6 +438,42 @@ void compiler_subgroup_reduce_max_float(void)
subgroup_generic(WG_REDUCE_MAX, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_float);
+void compiler_subgroup_reduce_max_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_reduce.cl",
+ "compiler_subgroup_reduce_max_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_REDUCE_MAX, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_half);
+void compiler_subgroup_reduce_max_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+ "compiler_subgroup_reduce_max_short");
+ subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_short);
+void compiler_subgroup_reduce_max_ushort(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+ "compiler_subgroup_reduce_max_ushort");
+ subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_ushort);
/*
* Workgroup reduce min utest functions
@@ -423,3 +533,39 @@ void compiler_subgroup_reduce_min_float(void)
subgroup_generic(WG_REDUCE_MIN, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_float);
+void compiler_subgroup_reduce_min_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_reduce.cl",
+ "compiler_subgroup_reduce_min_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_REDUCE_MIN, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_half);
+void compiler_subgroup_reduce_min_short(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+ "compiler_subgroup_reduce_min_short");
+ subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_short);
+void compiler_subgroup_reduce_min_ushort(void)
+{
+ if(!cl_check_subgroups_short())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+ "compiler_subgroup_reduce_min_ushort");
+ subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_ushort);
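
Since fp16 addition is not associative enough to survive a different evaluation order on the device, the half paths above accept a 3% relative error instead of bit equality. Stripped of the test plumbing, the check is:

    #include <cmath>

    // Relative-error acceptance used by the IS_HALF branches (sketch).
    static bool half_close(float computed, float expected, float tol = 0.03f) {
      return std::fabs(computed - expected) <= tol * std::fabs(expected);
    }
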
diff --git a/utests/compiler_subgroup_scan_exclusive.cpp b/utests/compiler_subgroup_scan_exclusive.cpp
index 1a21b59..4f3e5ea 100644
--- a/utests/compiler_subgroup_scan_exclusive.cpp
+++ b/utests/compiler_subgroup_scan_exclusive.cpp
@@ -32,36 +32,56 @@ template<class T>
static void compute_expected(WG_FUNCTION wg_func,
T* input,
T* expected,
- size_t SIMD_SIZE)
+ size_t SIMD_SIZE,
+ bool IS_HALF)
{
if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
{
expected[0] = 0;
expected[1] = input[0];
- for(uint32_t i = 2; i < SIMD_SIZE; i++)
- expected[i] = input[i - 1] + expected[i - 1];
+ for(uint32_t i = 2; i < SIMD_SIZE; i++) {
+ if (IS_HALF)
+ expected[i] = __float_to_half(as_uint(as_float(__half_to_float(input[i - 1])) +
+ as_float(__half_to_float(expected[i - 1]))));
+ else
+ expected[i] = input[i - 1] + expected[i - 1];
+ }
}
else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
{
- if(numeric_limits<T>::is_integer)
+ if(IS_HALF)
+ expected[0] = 0xFC00;
+ else if(numeric_limits<T>::is_integer)
expected[0] = numeric_limits<T>::min();
else
expected[0] = - numeric_limits<T>::infinity();
expected[1] = input[0];
- for(uint32_t i = 2; i < SIMD_SIZE; i++)
- expected[i] = max(input[i - 1], expected[i - 1]);
+ for(uint32_t i = 2; i < SIMD_SIZE; i++) {
+ if (IS_HALF)
+ expected[i] = (as_float(__half_to_float(input[i - 1])) > as_float(__half_to_float(expected[i - 1]))) ?
+ input[i - 1] : expected[i - 1];
+ else
+ expected[i] = max(input[i - 1], expected[i - 1]);
+ }
}
else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
{
- if(numeric_limits<T>::is_integer)
+ if(IS_HALF)
+ expected[0] = 0x7C00;
+ else if(numeric_limits<T>::is_integer)
expected[0] = numeric_limits<T>::max();
else
expected[0] = numeric_limits<T>::infinity();
expected[1] = input[0];
- for(uint32_t i = 2; i < SIMD_SIZE; i++)
- expected[i] = min(input[i - 1], expected[i - 1]);
+ for(uint32_t i = 2; i < SIMD_SIZE; i++) {
+ if (IS_HALF)
+ expected[i] = (as_float(__half_to_float(input[i - 1])) < as_float(__half_to_float(expected[i - 1]))) ?
+ input[i - 1] : expected[i - 1];
+ else
+ expected[i] = min(input[i - 1], expected[i - 1]);
+ }
}
}
@@ -73,7 +93,8 @@ template<class T>
static void generate_data(WG_FUNCTION wg_func,
T* &input,
T* &expected,
- size_t SIMD_SIZE)
+ size_t SIMD_SIZE,
+ bool IS_HALF)
{
input = new T[WG_GLOBAL_SIZE];
expected = new T[WG_GLOBAL_SIZE];
@@ -101,6 +122,8 @@ static void generate_data(WG_FUNCTION wg_func,
input[gid + lid] += ((rand() % 2 - 1) * base_val);
/* add trailing random bits, tests GENERAL cases */
input[gid + lid] += (rand() % 112);
+ if (IS_HALF)
+ input[gid + lid] = __float_to_half(as_uint((float)input[gid + lid]/2));
#if DEBUG_STDOUT
/* output generated input */
@@ -111,7 +134,7 @@ static void generate_data(WG_FUNCTION wg_func,
}
/* expected values */
- compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+ compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE, IS_HALF);
#if DEBUG_STDOUT
/* output expected input */
@@ -134,7 +157,8 @@ static void generate_data(WG_FUNCTION wg_func,
template<class T>
static void subgroup_generic(WG_FUNCTION wg_func,
T* input,
- T* expected)
+ T* expected,
+ bool IS_HALF = false)
{
/* get simd size */
globals[0] = WG_GLOBAL_SIZE;
@@ -143,7 +167,7 @@ static void subgroup_generic(WG_FUNCTION wg_func,
OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
/* input and expected data */
- generate_data(wg_func, input, expected, SIMD_SIZE);
+ generate_data(wg_func, input, expected, SIMD_SIZE, IS_HALF);
/* prepare input for data type */
OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
@@ -166,8 +190,21 @@ static void subgroup_generic(WG_FUNCTION wg_func,
for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
if(((T *)buf_data[1])[i] != *(expected + i))
{
+ if (IS_HALF) {
+ float num_computed = as_float(__half_to_float(((T *)buf_data[1])[i]));
+ float num_expected = as_float(__half_to_float(*(expected + i)));
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if (num_diff > 0.03f) {
+ mismatches++;
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " << num_computed
+ << " != " << num_expected <<" diff: " <<num_diff <<endl;
+#endif
+ }
+ }
/* found mismatch on integer, increment */
- if(numeric_limits<T>::is_integer){
+ else if (numeric_limits<T>::is_integer) {
mismatches++;
#if DEBUG_STDOUT
@@ -261,6 +298,42 @@ void compiler_subgroup_scan_exclusive_add_float(void)
subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_float);
+void compiler_subgroup_scan_exclusive_add_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_exclusive.cl",
+ "compiler_subgroup_scan_exclusive_add_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_half);
+void compiler_subgroup_scan_exclusive_add_short(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_short");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_short);
+void compiler_subgroup_scan_exclusive_add_ushort(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_ushort");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_ushort);
/*
* Workgroup scan_exclusive max utest functions
@@ -320,6 +393,42 @@ void compiler_subgroup_scan_exclusive_max_float(void)
subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_float);
+void compiler_subgroup_scan_exclusive_max_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_exclusive.cl",
+ "compiler_subgroup_scan_exclusive_max_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_half);
+void compiler_subgroup_scan_exclusive_max_short(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_short");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_short);
+void compiler_subgroup_scan_exclusive_max_ushort(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_ushort");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_ushort);
/*
* Workgroup scan_exclusive min utest functions
@@ -379,3 +488,39 @@ void compiler_subgroup_scan_exclusive_min_float(void)
subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_float);
+void compiler_subgroup_scan_exclusive_min_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_exclusive.cl",
+ "compiler_subgroup_scan_exclusive_min_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_half);
+void compiler_subgroup_scan_exclusive_min_short(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_short");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_short);
+void compiler_subgroup_scan_exclusive_min_ushort(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_ushort");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_ushort);
diff --git a/utests/compiler_subgroup_scan_inclusive.cpp b/utests/compiler_subgroup_scan_inclusive.cpp
index fa32855..8f8c264 100644
--- a/utests/compiler_subgroup_scan_inclusive.cpp
+++ b/utests/compiler_subgroup_scan_inclusive.cpp
@@ -32,25 +32,41 @@ template<class T>
static void compute_expected(WG_FUNCTION wg_func,
T* input,
T* expected,
- size_t SIMD_SIZE)
+ size_t SIMD_SIZE,
+ bool IS_HALF)
{
if(wg_func == WG_SCAN_INCLUSIVE_ADD)
{
expected[0] = input[0];
- for(uint32_t i = 1; i < SIMD_SIZE; i++)
- expected[i] = input[i] + expected[i - 1];
+ for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+ if (IS_HALF)
+ expected[i] = __float_to_half(as_uint(as_float(__half_to_float(input[i])) +
+ as_float(__half_to_float(expected[i - 1]))));
+ else
+ expected[i] = input[i] + expected[i - 1];
+ }
}
else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
{
expected[0] = input[0];
- for(uint32_t i = 1; i < SIMD_SIZE; i++)
- expected[i] = max(input[i], expected[i - 1]);
+ for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+ if (IS_HALF)
+ expected[i] = (as_float(__half_to_float(input[i])) > as_float(__half_to_float(expected[i - 1]))) ?
+ input[i] : expected[i - 1];
+ else
+ expected[i] = max(input[i], expected[i - 1]);
+ }
}
else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
{
expected[0] = input[0];
- for(uint32_t i = 1; i < SIMD_SIZE; i++)
- expected[i] = min(input[i], expected[i - 1]);
+ for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+ if (IS_HALF)
+ expected[i] = (as_float(__half_to_float(input[i])) < as_float(__half_to_float(expected[i - 1]))) ?
+ input[i] : expected[i - 1];
+ else
+ expected[i] = min(input[i], expected[i - 1]);
+ }
}
}
@@ -62,7 +78,8 @@ template<class T>
static void generate_data(WG_FUNCTION wg_func,
T* &input,
T* &expected,
- size_t SIMD_SIZE)
+ size_t SIMD_SIZE,
+ bool IS_HALF)
{
input = new T[WG_GLOBAL_SIZE];
expected = new T[WG_GLOBAL_SIZE];
@@ -91,6 +108,8 @@ static void generate_data(WG_FUNCTION wg_func,
input[gid + lid] += ((rand() % 2 - 1) * base_val);
/* add trailing random bits, tests GENERAL cases */
input[gid + lid] += (rand() % 112);
+ if (IS_HALF)
+ input[gid + lid] = __float_to_half(as_uint((float)input[gid + lid]/2));
#if DEBUG_STDOUT
/* output generated input */
@@ -101,7 +120,7 @@ static void generate_data(WG_FUNCTION wg_func,
}
/* expected values */
- compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+ compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE, IS_HALF);
#if DEBUG_STDOUT
/* output expected input */
@@ -124,7 +143,8 @@ static void generate_data(WG_FUNCTION wg_func,
template<class T>
static void subgroup_generic(WG_FUNCTION wg_func,
T* input,
- T* expected)
+ T* expected,
+ bool IS_HALF = false)
{
/* get simd size */
globals[0] = WG_GLOBAL_SIZE;
@@ -133,7 +153,7 @@ static void subgroup_generic(WG_FUNCTION wg_func,
OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
/* input and expected data */
- generate_data(wg_func, input, expected, SIMD_SIZE);
+ generate_data(wg_func, input, expected, SIMD_SIZE, IS_HALF);
/* prepare input for data type */
OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
@@ -156,8 +176,21 @@ static void subgroup_generic(WG_FUNCTION wg_func,
for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
if(((T *)buf_data[1])[i] != *(expected + i))
{
+ if (IS_HALF) {
+ float num_computed = as_float(__half_to_float(((T *)buf_data[1])[i]));
+ float num_expected = as_float(__half_to_float(*(expected + i)));
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if (num_diff > 0.03f) {
+ mismatches++;
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " << num_computed
+ << " != " << num_expected <<" diff: " <<num_diff <<endl;
+#endif
+ }
+ }
/* found mismatch on integer, increment */
- if(numeric_limits<T>::is_integer){
+ else if (numeric_limits<T>::is_integer) {
mismatches++;
#if DEBUG_STDOUT
@@ -251,6 +284,42 @@ void compiler_subgroup_scan_inclusive_add_float(void)
subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_float);
+void compiler_subgroup_scan_inclusive_add_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_inclusive.cl",
+ "compiler_subgroup_scan_inclusive_add_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_half);
+void compiler_subgroup_scan_inclusive_add_short(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_short");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_short);
+void compiler_subgroup_scan_inclusive_add_ushort(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_ushort");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_ushort);
/*
* Workgroup scan_inclusive max utest functions
@@ -310,6 +379,42 @@ void compiler_subgroup_scan_inclusive_max_float(void)
subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_float);
+void compiler_subgroup_scan_inclusive_max_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_inclusive.cl",
+ "compiler_subgroup_scan_inclusive_max_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_half);
+void compiler_subgroup_scan_inclusive_max_short(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_short");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_short);
+void compiler_subgroup_scan_inclusive_max_ushort(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_ushort");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_ushort);
/*
* Workgroup scan_inclusive min utest functions
@@ -369,4 +474,39 @@ void compiler_subgroup_scan_inclusive_min_float(void)
subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
}
MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_float);
-
+void compiler_subgroup_scan_inclusive_min_half(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ if(!cl_check_half())
+ return;
+ cl_half *input = NULL;
+ cl_half *expected = NULL;
+ OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_inclusive.cl",
+ "compiler_subgroup_scan_inclusive_min_half",
+ SOURCE, "-DHALF");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected, true);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_half);
+void compiler_subgroup_scan_inclusive_min_short(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_short *input = NULL;
+ cl_short *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_short");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_short);
+void compiler_subgroup_scan_inclusive_min_ushort(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ushort *input = NULL;
+ cl_ushort *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_ushort");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_ushort);
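
Taken together with compiler_subgroup_scan_exclusive.cpp, the expected arrays differ only in where the fold starts: an inclusive scan at position i folds input[0..i], while an exclusive scan folds input[0..i-1] and seeds position 0 with the operation's identity (0 for add, -infinity / 0xFC00 for max, +infinity / 0x7C00 for min). A generic host-side sketch of both:

    #include <cstddef>
    #include <vector>

    template <typename T, typename Op>
    std::vector<T> ref_scan(const std::vector<T> &in, Op op, T identity,
                            bool inclusive) {
      std::vector<T> out(in.size());
      T acc = identity;
      for (std::size_t i = 0; i < in.size(); ++i) {
        if (inclusive) { acc = op(acc, in[i]); out[i] = acc; }
        else           { out[i] = acc; acc = op(acc, in[i]); }
      }
      return out;
    }
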
diff --git a/utests/load_program_from_spir.cpp b/utests/load_program_from_spir.cpp
index bb53947..767a488 100644
--- a/utests/load_program_from_spir.cpp
+++ b/utests/load_program_from_spir.cpp
@@ -35,7 +35,10 @@ static void test_load_program_from_spir(void)
fprintf(stderr, "run out of memory\n");
return;
}
- ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
+ if(sizeof(size_t) == 8)
+ ker_path = cl_do_kiss_path("compiler_ceil64.spir", device);
+ else
+ ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
diff --git a/utests/multi_queue_events.cpp b/utests/multi_queue_events.cpp
new file mode 100644
index 0000000..4545167
--- /dev/null
+++ b/utests/multi_queue_events.cpp
@@ -0,0 +1,129 @@
+#include "utest_helper.hpp"
+
+#define THREAD_SIZE 8
+pthread_t tid[THREAD_SIZE];
+static cl_command_queue all_queues[THREAD_SIZE];
+static cl_event enqueue_events[THREAD_SIZE];
+static cl_event user_event;
+static cl_kernel the_kernel;
+static char source_str[] =
+ "kernel void assgin_work_dim( __global int *ret, int i) { \n"
+ "if (i == 0) ret[i] = 10; \n"
+ "else ret[i] = ret[i - 1] + 1; \n"
+ "}\n";
+static size_t the_globals[3] = {16, 1, 1};
+static size_t the_locals[3] = {16, 1, 1};
+static size_t the_goffsets[3] = {0, 0, 0};
+
+static void *thread_function(void *arg)
+{
+ int num = *((int *)arg);
+ cl_int ret;
+ cl_event dep_event[2];
+
+ ret = clSetKernelArg(the_kernel, 1, sizeof(cl_int), &num);
+ OCL_ASSERT(ret == CL_SUCCESS);
+
+ if (num == 0) {
+ dep_event[0] = user_event;
+ ret = clEnqueueNDRangeKernel(all_queues[num], the_kernel, 1, the_goffsets, the_globals, the_locals,
+ 1, dep_event, &enqueue_events[num]);
+ } else {
+ dep_event[0] = user_event;
+ dep_event[1] = enqueue_events[num - 1];
+ ret = clEnqueueNDRangeKernel(all_queues[num], the_kernel, 1, the_goffsets, the_globals, the_locals,
+ 2, dep_event, &enqueue_events[num]);
+ }
+
+ OCL_ASSERT(ret == CL_SUCCESS);
+ return NULL;
+}
+
+void multi_queue_events(void)
+{
+ cl_int ret;
+ size_t source_size = sizeof(source_str);
+ const char *source = source_str;
+ cl_program program = NULL;
+ int i;
+
+ /* Create Kernel Program from the source */
+ program = clCreateProgramWithSource(ctx, 1, &source, &source_size, &ret);
+ OCL_ASSERT(ret == CL_SUCCESS);
+
+ /* Build Kernel Program */
+ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+ OCL_ASSERT(ret == CL_SUCCESS);
+
+ the_kernel = clCreateKernel(program, "assgin_work_dim", NULL);
+ OCL_ASSERT(the_kernel != NULL);
+
+ int buffer_content[16] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+ cl_mem buf = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, 16 * 4, buffer_content, &ret);
+ OCL_ASSERT(buf != NULL);
+
+ ret = clSetKernelArg(the_kernel, 0, sizeof(cl_mem), &buf);
+ OCL_ASSERT(ret == CL_SUCCESS);
+
+ for (i = 0; i < THREAD_SIZE; i++) {
+ all_queues[i] = clCreateCommandQueue(ctx, device, 0, &ret);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ }
+
+ user_event = clCreateUserEvent(ctx, &ret);
+ OCL_ASSERT(ret == CL_SUCCESS);
+
+ for (i = 0; i < THREAD_SIZE; i++) {
+ pthread_create(&tid[i], NULL, thread_function, &i);
+ pthread_join(tid[i], NULL);
+ }
+
+ cl_event map_event;
+
+ void *map_ptr = clEnqueueMapBuffer(all_queues[0], buf, 0, CL_MAP_READ, 0, 32,
+ THREAD_SIZE, enqueue_events, &map_event, NULL);
+
+ OCL_ASSERT(map_ptr != NULL);
+
+ cl_event all_event[10];
+ for (i = 0; i < THREAD_SIZE; i++) {
+ all_event[i] = enqueue_events[i];
+ }
+ all_event[8] = user_event;
+ all_event[9] = map_event;
+
+ //printf("before Waitfor events ##\n");
+ clSetUserEventStatus(user_event, CL_COMPLETE);
+ ret = clWaitForEvents(10, all_event);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ //printf("After Waitfor events ##\n");
+
+ //printf("############# Finish Setting ################\n");
+
+ printf("\n");
+ for (i = 0; i < 8; i++) {
+ //printf(" %d", ((int *)map_ptr)[i]);
+ OCL_ASSERT(((int *)map_ptr)[i] == 10 + i);
+ }
+
+ //printf("\n");
+
+ ret = clEnqueueUnmapMemObject(all_queues[0], buf, map_ptr, 1, &map_event, NULL);
+ OCL_ASSERT(ret == CL_SUCCESS);
+
+ //printf("------------------------- End -------------------------------\n");
+
+ clReleaseKernel(the_kernel);
+ clReleaseProgram(program);
+ clReleaseMemObject(buf);
+ for (i = 0; i < THREAD_SIZE; i++) {
+ clReleaseCommandQueue(all_queues[i]);
+ clReleaseEvent(enqueue_events[i]);
+ }
+ clReleaseEvent(user_event);
+ clReleaseEvent(map_event);
+}
+
+MAKE_UTEST_FROM_FUNCTION(multi_queue_events);
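
The dependency chain built above (kernel i waits on enqueue_events[i-1] plus the shared user event) serializes the eight kernels even though they run on eight different queues, so ret[0] = 10 and every later element is its predecessor plus one. A host-side model of the values the mapped-buffer loop expects (assumes THREAD_SIZE == 8 as in the test):

    #include <cassert>

    int main() {
      int ret[8];
      for (int i = 0; i < 8; ++i)
        ret[i] = (i == 0) ? 10 : ret[i - 1] + 1; // serialized by the event chain
      for (int i = 0; i < 8; ++i)
        assert(ret[i] == 10 + i);                // mirrors the OCL_ASSERT loop
      return 0;
    }
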
diff --git a/utests/runtime_barrier_list.cpp b/utests/runtime_barrier_list.cpp
index 3b8d3c3..7e12b66 100644
--- a/utests/runtime_barrier_list.cpp
+++ b/utests/runtime_barrier_list.cpp
@@ -23,7 +23,7 @@ void runtime_barrier_list(void)
OCL_CREATE_USER_EVENT(ev[0]);
- clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+ clEnqueueWriteBuffer(queue, buf[0], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(int), &value);
@@ -40,23 +40,22 @@ void runtime_barrier_list(void)
}
- buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_FALSE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
clEnqueueBarrierWithWaitList(queue, 0, NULL, &ev[3]);
- clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+ clEnqueueWriteBuffer(queue, buf[1], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
- OCL_FINISH();
clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
OCL_ASSERT(status != CL_COMPLETE);
OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+ OCL_FINISH();
+
clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
OCL_ASSERT(status == CL_COMPLETE);
- OCL_FINISH();
-
for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
OCL_ASSERT(status <= CL_COMPLETE);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
index 00e02f1..5804f95 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_event.cpp
@@ -18,7 +18,7 @@ void runtime_event(void)
OCL_CREATE_USER_EVENT(ev[0]);
- clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+ clEnqueueWriteBuffer(queue, buf[0], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(int), &value);
@@ -33,7 +33,7 @@ void runtime_event(void)
OCL_ASSERT(status >= CL_SUBMITTED);
}
- buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_FALSE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp
index 751f4a0..b4e6edb 100644
--- a/utests/runtime_marker_list.cpp
+++ b/utests/runtime_marker_list.cpp
@@ -23,7 +23,7 @@ void runtime_marker_list(void)
OCL_CREATE_USER_EVENT(ev[0]);
- clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+ clEnqueueWriteBuffer(queue, buf[0], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(int), &value);
@@ -40,23 +40,22 @@ void runtime_marker_list(void)
}
- buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_FALSE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
clEnqueueMarkerWithWaitList(queue, 0, NULL, &ev[3]);
- clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+ clEnqueueWriteBuffer(queue, buf[1], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 1, &ev[3], &ev[4]);
- OCL_FINISH();
clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
- OCL_ASSERT(status == CL_COMPLETE);
+ OCL_ASSERT(status != CL_COMPLETE);
OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+ OCL_FINISH();
+
clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
OCL_ASSERT(status == CL_COMPLETE);
- OCL_FINISH();
-
for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
OCL_ASSERT(status <= CL_COMPLETE);
diff --git a/utests/runtime_pipe_query.cpp b/utests/runtime_pipe_query.cpp
new file mode 100644
index 0000000..3ce8258
--- /dev/null
+++ b/utests/runtime_pipe_query.cpp
@@ -0,0 +1,15 @@
+#include <string.h>
+#include "utest_helper.hpp"
+static void runtime_pipe_query(void) {
+ const size_t w = 16;
+ const size_t sz = 8;
+ cl_uint retnum, retsz;
+ /* Create a pipe with sz-byte packets and room for w of them. */
+ OCL_CALL2(clCreatePipe, buf[0], ctx, 0, sz, w, NULL);
+ OCL_CALL(clGetPipeInfo, buf[0], CL_PIPE_MAX_PACKETS, sizeof(retnum), &retnum, NULL);
+ OCL_CALL(clGetPipeInfo, buf[0], CL_PIPE_PACKET_SIZE, sizeof(retsz), &retsz, NULL);
+
+ /* Check the result. */
+ OCL_ASSERT(sz == retsz && w == retnum);
+}
+MAKE_UTEST_FROM_FUNCTION(runtime_pipe_query);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
index 67e3bf1..e1282df 100644
--- a/utests/setenv.sh.in
+++ b/utests/setenv.sh.in
@@ -2,7 +2,9 @@
#
export OCL_BITCODE_LIB_PATH=@LOCAL_OCL_BITCODE_BIN@
export OCL_HEADER_FILE_DIR=@LOCAL_OCL_HEADER_DIR@
+export OCL_BITCODE_LIB_20_PATH=@LOCAL_OCL_BITCODE_BIN_20@
export OCL_PCH_PATH=@LOCAL_OCL_PCH_OBJECT@
+export OCL_PCH_20_PATH=@LOCAL_OCL_PCH_OBJECT_20@
export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 4f50f3f..b57d2ad 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -57,7 +57,7 @@ __thread size_t locals[3] = {};
float ULPSIZE_FAST_MATH = 10000.;
__attribute__ ((visibility ("internal"))) clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR = NULL;
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
Display *xDisplay;
EGLDisplay eglDisplay;
EGLContext eglContext = NULL;
@@ -171,6 +171,8 @@ cl_test_channel_order_string(cl_channel_order order)
DECL_ORDER(Rx);
DECL_ORDER(RGx);
DECL_ORDER(RGBx);
+ DECL_ORDER(sRGBA);
+ DECL_ORDER(sBGRA);
#undef DECL_ORDER
default: return "Unsupported image channel order";
};
@@ -435,7 +437,7 @@ cl_ocl_init(void)
cl_int status = CL_SUCCESS;
cl_uint platform_n;
size_t i;
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
bool hasGLExt = false;
#endif
cl_context_properties *props = NULL;
@@ -464,7 +466,7 @@ cl_ocl_init(void)
GET_DEVICE_STR_INFO(version, VERSION);
GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
hasGLExt = true;
}
@@ -476,7 +478,7 @@ cl_ocl_init(void)
goto error;
}
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
if (hasGLExt) {
int i = 0;
props = new cl_context_properties[7];
@@ -556,7 +558,7 @@ cl_ocl_destroy(void)
{
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
if (eglContext != NULL) {
cl_ocl_destroy_egl_window();
eglContext = NULL;
@@ -917,6 +919,26 @@ int cl_check_subgroups(void)
return 1;
}
+int cl_check_subgroups_short(void)
+{
+ if (!cl_check_subgroups())
+ return 0;
+ std::string extStr;
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+ param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+ if (!param_value.empty())
+ extStr = std::string(&param_value.front(), param_value_size-1);
+
+ if (std::strstr(extStr.c_str(), "cl_intel_subgroups_short") == NULL) {
+ printf("No cl_intel_subgroups_short, Skip!");
+ return 0;
+ }
+ return 1;
+}
+
int cl_check_ocl20(void)
{
size_t param_value_size;
@@ -970,110 +992,119 @@ int cl_check_half(void)
return 1;
}
-uint32_t __half_to_float(uint16_t h, bool* isInf, bool* infSign)
+uint32_t __half_to_float(uint16_t h, bool *isInf, bool *infSign)
{
- struct __FP32 {
- uint32_t mantissa:23;
- uint32_t exponent:8;
- uint32_t sign:1;
- };
- struct __FP16 {
- uint32_t mantissa:10;
- uint32_t exponent:5;
- uint32_t sign:1;
- };
- uint32_t f;
- __FP32 o;
- memset(&o, 0, sizeof(o));
- __FP16 i;
- memcpy(&i, &h, sizeof(uint16_t));
+ uint32_t out_val = 0;
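+ /* IEEE 754 half layout: 1 sign bit, 5 exponent bits (bias 15),
+ 10 fraction bits. */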
+ uint16_t sign = (h & 0x8000) >> 15;
+ uint16_t exp = (h & 0x7c00) >> 10;
+ uint16_t fraction = h & 0x03ff;
if (isInf)
*isInf = false;
if (infSign)
*infSign = false;
- if (i.exponent == 0 && i.mantissa == 0) // (Signed) zero
- o.sign = i.sign;
- else {
- if (i.exponent == 0) { // Denormal (converts to normalized)
- // Adjust mantissa so it's normalized (and keep
- // track of exponent adjustment)
- int e = -1;
- uint m = i.mantissa;
- do {
- e++;
- m <<= 1;
- } while ((m & 0x400) == 0);
-
- o.mantissa = (m & 0x3ff) << 13;
- o.exponent = 127 - 15 - e;
- o.sign = i.sign;
- } else if (i.exponent == 0x1f) { // Inf/NaN
- // NOTE: Both can be handled with same code path
- // since we just pass through mantissa bits.
- o.mantissa = i.mantissa << 13;
- o.exponent = 255;
- o.sign = i.sign;
-
- if (isInf) {
- *isInf = (i.mantissa == 0);
- if (infSign)
- *infSign = !i.sign;
- }
- } else { // Normalized number
- o.mantissa = i.mantissa << 13;
- o.exponent = 127 - 15 + i.exponent;
- o.sign = i.sign;
+ if (exp == 0 && fraction == 0) { // (Signed) zero
+ return (sign << 31);
+ }
+
+ if (exp == 0) { // subnormal mode
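+ /* Renormalize: shift the fraction left until the implicit bit
+ (0x400) appears, counting shifts to adjust the exponent. */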
+ assert(fraction > 0);
+ exp = -1;
+ do {
+ fraction = fraction << 1;
+ exp++;
+ } while ((fraction & 0x400) == 0);
+ exp = 127 - exp - 15;
+ out_val = (sign << 31) | ((exp & 0xff) << 23) | ((fraction & 0x3ff) << 13);
+ return out_val;
+ }
+
+ if (exp == 0x1f) { // inf or NAN
+ if (fraction == 0) { // inf
+ out_val = (sign << 31) | (255 << 23);
+ if (isInf)
+ *isInf = true;
+ if (infSign)
+ *infSign = (sign == 0) ? 1 : 0;
+
+ return out_val;
+ } else { // NAN mode
+ out_val = (sign << 31) | (255 << 23) | 0x7fffff;
+ return out_val;
}
}
- memcpy(&f, &o, sizeof(uint32_t));
- return f;
+ // Easy case, just convert.
+ exp = 127 - 15 + exp;
+ out_val = (sign << 31) | ((exp & 0xff) << 23) | ((fraction & 0x3ff) << 13);
+ return out_val;
}
-
uint16_t __float_to_half(uint32_t x)
{
- uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
- uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
- unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
-
- /* If zero, or denormal, or exponent underflows too much for a denormal
- * half, return signed zero. */
- if (e < 103)
- return bits;
-
- /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
- if (e > 142) {
- bits |= 0x7c00u;
- /* If exponent was 0xff and one mantissa bit was set, it means NaN,
- * not Inf, so make sure we set one mantissa bit too. */
- bits |= e == 255 && (x & 0x007fffffu);
- return bits;
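+ /* IEEE 754 float layout: 1 sign bit, 8 exponent bits (bias 127),
+ 23 fraction bits. */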
+ uint16_t sign = (x & 0x80000000) >> 31;
+ uint16_t exp = (x & 0x7F800000) >> 23;
+ uint32_t fraction = (x & 0x7fffff);
+ uint16_t out_val = 0;
+
+ /* Handle the float NAN format. */
+ if (exp == 0xFF && fraction != 0) {
+ /* Return a NaN half; keep the top fraction bits and force a
+ nonzero fraction so the result is not mistaken for infinity. */
+ out_val = (sign << 15) | 0x7C00 | (fraction >> 13) | 0x1;
+ return out_val;
}
- /* If exponent underflows but not too much, return a denormal */
- if (e < 113) {
- m |= 0x0800u;
- /* Extra rounding may overflow and set mantissa to 0 and exponent
- * to 1, which is OK. */
- bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
- return bits;
+ /* The float exponent ranges over -126..127, the half exponent over -14..15. */
+ if (exp - 127 > 15) { // Should overflow.
+ /* return +- inf. */
+ out_val = (sign << 15) | (0x7C00);
+ return out_val;
}
- bits |= ((e - 112) << 10) | (m >> 1);
- /* Extra rounding. An overflow will set mantissa to 0 and increment
- * the exponent, which is OK. */
- bits += m & 1;
- return bits;
+ /* A half has a 10-bit fraction, so the value may still fit the
+ subnormal form (-1)^sign x 2^(-14) x 0.fraction. But if
+ exp - 127 < -14 - 10, it must underflow. */
+ if (exp < -14 + 127 - 10) { // Should underflow.
+ /* Return zero without subnormal numbers. */
+ out_val = (sign << 15);
+ return out_val;
+ }
+
+ if (exp < -14 + 127) { //May underflow, but may use subnormal numbers
+ int shift = -(exp - 127 + 14);
+ assert(shift > 0);
+ assert(shift <= 10);
+ fraction = fraction | 0x0800000; // make the implicit leading 1 explicit
+ fraction = fraction >> shift;
+ // Narrow to 11 bits: the 10 half fraction bits plus one rounding bit.
+ fraction = (fraction & 0x7ff000) >> 12;
+ out_val = (sign << 15) | ((fraction >> 1) & 0x3ff);
+ if (fraction & 0x01)
+ out_val++;
+ return out_val;
+ }
+
+ /* Easy case, just convert. */
+ fraction = (fraction & 0x7ff000) >> 12;
+ exp = exp - 127 + 15;
+ assert(exp > 0);
+ assert(exp < 0x01f);
+ out_val = (sign << 15) | (exp << 10) | ((fraction >> 1) & 0x3ff);
+ if (fraction & 0x01)
+ out_val++;
+ return out_val;
}
-uint32_t as_uint(float f) {
+
+uint32_t as_uint(float f)
+{
union uint32_cast _tmp;
_tmp._float = f;
return _tmp._uint;
}
-float as_float(uint32_t i) {
+
+float as_float(uint32_t i)
+{
union uint32_cast _tmp;
_tmp._uint = i;
return _tmp._float;
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index a6e8180..a761325 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -39,13 +39,15 @@
#define __thread
#endif
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
#define EGL_WINDOW_WIDTH 256
#define EGL_WINDOW_HEIGHT 256
+#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
+#include <GL/glext.h>
#include <EGL/egl.h>
#include <EGL/eglext.h>
-#include <CL/cl_gl.h>
+#include <CL/cl_gl.h>
extern EGLDisplay eglDisplay;
extern EGLContext eglContext;
@@ -126,6 +128,9 @@ union uint32_cast {
#define OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(ID) \
OCL_CALL(clEnqueueAcquireGLObjects, queue, 1, &buf[ID], 0, 0, 0)
+#define OCL_ENQUEUE_RELEASE_GL_OBJECTS(ID) \
+ OCL_CALL(clEnqueueReleaseGLObjects, queue, 1, &buf[ID], 0, 0, 0)
+
#define OCL_SWAP_EGL_BUFFERS() \
eglSwapBuffers(eglDisplay, eglSurface);
@@ -316,4 +321,6 @@ extern uint32_t __half_to_float(uint16_t h, bool* isInf = NULL, bool* infSign =
extern uint16_t __float_to_half(uint32_t x);
extern float as_float(uint32_t i);
extern uint32_t as_uint(float f);
+/* Check whether the cl_intel_subgroups_short extension is enabled. */
+extern int cl_check_subgroups_short(void);
#endif /* __UTEST_HELPER_HPP__ */
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git